Spaces:

CompactAI
/

AIFinder

Running

App Files Files Community

AIFinder / config.py

CompactAI

Upload 8 files

17ef86f verified 1 day ago

raw

history blame contribute delete

4.24 kB

	"""
	AIFinder Configuration
	Dataset registry, label mappings, and feature parameters.
	"""

	import os

	# --- Paths ---
	# Absolute directory that contains this config module.
	BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
	# Path of the "models" entry directly under BASE_DIR (nothing is created here).
	MODEL_DIR = os.path.join(BASE_DIR, "models")

	# --- Dataset Registry ---
	# One 4-tuple per dataset, in the order:
	#     (hf_dataset_id, provider, model_name, optional_kwargs)
	# optional_kwargs carries per-dataset options (subset name, split, etc.).
	# The only key used below is "max_samples" -- presumably a per-dataset row
	# cap applied by the loader; confirm against the loading code.
	DATASET_REGISTRY = [
	    # Anthropic
	    ("TeichAI/claude-4.5-opus-high-reasoning-250x", "Anthropic", "Claude 4.5 Opus", {}),
	    ("TeichAI/claude-sonnet-4.5-high-reasoning-250x", "Anthropic", "Claude Sonnet 4.5", {}),
	    ("Roman1111111/claude-opus-4.6-10000x", "Anthropic", "Claude Opus 4.6", {"max_samples": 1500}),
	    # OpenAI
	    ("TeichAI/gpt-5.2-high-reasoning-250x", "OpenAI", "GPT-5.2", {}),
	    ("TeichAI/gpt-5.1-high-reasoning-1000x", "OpenAI", "GPT-5.1", {}),
	    ("TeichAI/gpt-5.1-codex-max-1000x", "OpenAI", "GPT-5.1 Codex Max", {}),
	    ("TeichAI/gpt-5-codex-250x", "OpenAI", "GPT-5 Codex", {}),
	    ("TeichAI/gpt-5-codex-1000x", "OpenAI", "GPT-5 Codex", {}),
	    # Google
	    ("TeichAI/gemini-3-pro-preview-high-reasoning-1000x", "Google", "Gemini 3 Pro", {}),
	    ("TeichAI/gemini-3-pro-preview-high-reasoning-250x", "Google", "Gemini 3 Pro", {}),
	    ("TeichAI/gemini-2.5-flash-11000x", "Google", "Gemini 2.5 Flash", {"max_samples": 1500}),
	    ("TeichAI/Gemini-3-Flash-Preview-VIBE", "Google", "Gemini 3 Flash", {}),
	    ("TeichAI/gemini-3-flash-preview-1000x", "Google", "Gemini 3 Flash", {}),
	    ("TeichAI/gemini-3-flash-preview-complex-1000x", "Google", "Gemini 3 Flash", {}),
	    # xAI
	    ("TeichAI/brainstorm-v3.1-grok-4-fast-200x", "xAI", "Grok 4 Fast", {}),
	    ("TeichAI/sherlock-thinking-alpha-11000x", "xAI", "Grok 4.1 Fast", {"max_samples": 1500}),
	    ("TeichAI/sherlock-dash-alpha-1000x", "xAI", "Grok 4.1 Fast", {}),
	    ("TeichAI/sherlock-think-alpha-1000x", "xAI", "Grok 4.1 Fast", {}),
	    ("TeichAI/grok-code-fast-1-1000x", "xAI", "Grok Code Fast 1", {}),
	    # MoonshotAI
	    ("TeichAI/kimi-k2-thinking-250x", "MoonshotAI", "Kimi K2", {}),
	    ("TeichAI/kimi-k2-thinking-1000x", "MoonshotAI", "Kimi K2", {}),
	    # Mistral
	    ("TeichAI/mistral-small-creative-500x", "Mistral", "Mistral Small", {}),
	    # MiniMax
	    ("TeichAI/MiniMax-M2.1-Code-SFT", "MiniMax", "MiniMax M2.1", {"max_samples": 1500}),
	    ("TeichAI/convo-v1", "MiniMax", "MiniMax M2.1", {}),
	    # StepFun
	    ("TeichAI/Step-3.5-Flash-2600x", "StepFun", "Step 3.5 Flash", {"max_samples": 1500}),
	    # Zhipu
	    ("TeichAI/Pony-Alpha-15k", "Zhipu", "GLM-5", {"max_samples": 1500}),
	    # DeepSeek (TeichAI)
	    ("TeichAI/deepseek-v3.2-speciale-1000x", "DeepSeek", "DeepSeek V3.2 Speciale", {}),
	    ("TeichAI/deepseek-v3.2-speciale-openr1-math-3k", "DeepSeek", "DeepSeek V3.2 Speciale", {"max_samples": 1500}),
	]

	# DeepSeek (a-m-team) -- kept apart from DATASET_REGISTRY because these
	# datasets use a different format and are handled separately.
	# Entries use the same 4-tuple layout; "name" is presumably the dataset
	# subset/config to load -- confirm against the loader.
	DEEPSEEK_AM_DATASETS = [
	    (
	        "a-m-team/AM-DeepSeek-R1-Distilled-1.4M",
	        "DeepSeek",
	        "DeepSeek R1",
	        {"name": "am_0.9M", "max_samples": 1000},
	    ),
	]

	# Conversational datasets are currently disabled: the list is intentionally
	# left empty rather than removed.
	CONVERSATIONAL_DATASETS = []

	# --- Providers ---
	# Every provider name used by the dataset lists in this module, in the
	# order the provider sections appear there. Only providers are listed
	# here; no model-name list is defined.
	PROVIDERS = [
	    "Anthropic",
	    "OpenAI",
	    "Google",
	    "xAI",
	    "MoonshotAI",
	    "Mistral",
	    "MiniMax",
	    "StepFun",
	    "Zhipu",
	    "DeepSeek",
	]

	# --- Feature parameters ---
	# Word-level TF-IDF settings. The key names match scikit-learn's
	# TfidfVectorizer keyword arguments, so this dict is presumably unpacked
	# into that constructor -- confirm against the feature-extraction code.
	TFIDF_WORD_PARAMS = {
	    "analyzer": "word",
	    "ngram_range": (1, 2),
	    "max_features": 20,
	    "sublinear_tf": True,
	    "min_df": 3,
	    "max_df": 0.7,
	}

	# Character-level TF-IDF settings ("char_wb" analyzer, 2- to 4-grams).
	# Same shape as the word-level dict plus an explicit "smooth_idf" flag;
	# presumably also passed to scikit-learn's TfidfVectorizer -- confirm.
	TFIDF_CHAR_PARAMS = {
	    "analyzer": "char_wb",
	    "ngram_range": (2, 4),
	    "max_features": 20,
	    "sublinear_tf": True,
	    "min_df": 3,
	    "max_df": 0.7,
	    "smooth_idf": True,
	}

	# Sample budget per provider, intended to give every provider an equal
	# share (how the cap is enforced is decided by the code that reads it).
	MAX_SAMPLES_PER_PROVIDER = 1_000

	# --- Train/val/test split ---
	# Seed value for reproducible splitting.
	RANDOM_STATE = 42
	# Test and validation proportions. NOTE(review): whether VAL_SIZE is a
	# fraction of the full dataset or of what remains after the test split
	# depends on the splitting code -- confirm there.
	TEST_SIZE = 0.15
	VAL_SIZE = 0.10

	# --- Neural Network ---
	# Hyperparameters for the neural network, grouped by the role their names
	# suggest. The code that reads them is not part of this module.

	# Layer widths
	EMBED_DIM = 128
	HIDDEN_DIM = 256

	# Regularization
	DROPOUT = 0.7
	WEIGHT_DECAY = 8e-2
	LABEL_SMOOTHING = 0.3

	# Optimization loop
	LEARNING_RATE = 3e-5
	BATCH_SIZE = 128
	EPOCHS = 80
	EARLY_STOP_PATIENCE = 25