Text model name parser for grouping and analysis.
Provides functions to parse text generation model names into structured components
such as base name, size, variant, version, and quantization. Useful for grouping model
variants (e.g., different quant versions of the same base model).
SIZE_PATTERNS
module-attribute
# Regexes for the parameter-count token; capture group 1 is the size text.
# parse_text_model_name tries them in order with re.IGNORECASE and keeps the first hit.
SIZE_PATTERNS = [
    # Digits, optional decimal part, a B/M/K unit and optional trailing digits
    # (e.g. "7B", "1.5B", "7B1"); must not touch a letter on either side.
    r"(?<![a-zA-Z])(\d+\.?\d*[BMK]\d*)(?![a-zA-Z])",
    # "<n>x<m><unit>" sizes such as "8x7B".
    r"(?<![a-zA-Z])(\d+x\d+[BMK])(?![a-zA-Z])",
]
VERSION_PATTERNS
module-attribute
# Regexes for a "v"-prefixed version token; capture group 1 is the version text.
# Tried in order, so the dotted form wins over the bare major version.
VERSION_PATTERNS = [
    # Dotted versions such as "v0.1" or "V2.1.3", not glued to letters or digits.
    r"(?<![a-zA-Z0-9])([vV]\d+(?:\.\d+)+)(?![a-zA-Z0-9])",
    # Bare major versions such as "v2"; a following "." is rejected so this
    # cannot match the front of a dotted version.
    r"(?<![a-zA-Z0-9])([vV]\d+)(?![a-zA-Z0-9.])",
]
VARIANT_PATTERNS
module-attribute
# Regexes for the fine-tune / release variant keyword; capture group 1 is the keyword.
# NOTE: "\b" treats "_" as a word character, so a keyword joined to its
# neighbours with underscores (e.g. "Llama_3_Instruct") is not matched.
VARIANT_PATTERNS = [
    # Training-style variants.
    r"\b(Instruct|Chat|Code|Base|Uncensored|Finetune|FT)\b",
    # Release-channel style variants.
    r"\b(turbo|preview|latest)\b",
]
QUANT_PATTERNS
module-attribute
# Regexes for the quantization / weight-format token; capture group 1 is the token.
# Ordered most-specific first because only the first matching pattern is used.
# NOTE: "\b" treats "_" as a word character, so a token preceded or followed
# by an underscore separator is not matched.
QUANT_PATTERNS = [
    # K-quants with optional size suffix, e.g. "Q4_K", "Q4_K_M".
    r"\b(Q[2-8]_K(?:_[SMLH])?)\b",
    # Legacy quants, e.g. "Q4_0", "Q5_1".
    r"\b(Q[2-8]_[01])\b",
    # Bare bit-width, e.g. "Q4", "Q8".
    r"\b(Q[2-8])\b",
    # Container / quantization scheme names.
    r"\b(GGUF|GGML|GPTQ|AWQ|EXL2)\b",
    # Numeric precision labels.
    r"\b(fp16|fp32|int8|int4)\b",
]
SEPARATORS
module-attribute
# Characters treated as part separators; doubled occurrences are collapsed
# when parse_text_model_name cleans up the base name.
SEPARATORS = ["-", "_", " ", "."]
ParsedTextModelName
dataclass
Structured representation of a parsed text model name.
Attributes:
-
original_name
(str)
–
The original model name as provided.
-
base_name
(str)
–
The base model name without size/variant/quant/version info.
-
size
(str | None)
–
Model size if detected (e.g., "7B", "13B", "70B", "7B1").
-
variant
(str | None)
–
Model variant if detected (e.g., "Instruct", "Chat", "Code").
-
quant
(str | None)
–
Quantization type if detected (e.g., "Q4", "Q8", "GGUF").
-
version
(str | None)
–
Model version if detected (e.g., "v0.1", "v2.1").
-
normalized_name
(str | None)
–
A normalized version of the name for comparison.
Source code in src/horde_model_reference/analytics/text_model_parser.py
| @dataclass
class ParsedTextModelName:
"""Structured representation of a parsed text model name.
Attributes:
original_name: The original model name as provided.
base_name: The base model name without size/variant/quant/version info.
size: Model size if detected (e.g., "7B", "13B", "70B", "7B1").
variant: Model variant if detected (e.g., "Instruct", "Chat", "Code").
quant: Quantization type if detected (e.g., "Q4", "Q8", "GGUF").
version: Model version if detected (e.g., "v0.1", "v2.1").
normalized_name: A normalized version of the name for comparison.
"""
original_name: str
base_name: str
size: str | None = None
variant: str | None = None
quant: str | None = None
version: str | None = None
normalized_name: str | None = None
|
original_name
instance-attribute
base_name
instance-attribute
size
class-attribute
instance-attribute
variant
class-attribute
instance-attribute
variant: str | None = None
quant
class-attribute
instance-attribute
version
class-attribute
instance-attribute
version: str | None = None
normalized_name
class-attribute
instance-attribute
normalized_name: str | None = None
__init__
__init__(
original_name: str,
base_name: str,
size: str | None = None,
variant: str | None = None,
quant: str | None = None,
version: str | None = None,
normalized_name: str | None = None,
) -> None
TextModelGroup
dataclass
Represents a group of text model variants sharing the same base model.
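Attributes:
-
base_name
(str)
–
The base model name.
-
variants
(list[str])
–
List of full model names that are variants of the base model.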
Source code in src/horde_model_reference/analytics/text_model_parser.py
@dataclass
class TextModelGroup:
    """A set of full model names that all resolve to one base model.

    Attributes:
        base_name: The shared base model name.
        variants: Full model names that are variants of ``base_name``.
    """

    base_name: str
    variants: list[str]
|
base_name
instance-attribute
variants
instance-attribute
__init__
__init__(base_name: str, variants: list[str]) -> None
NameFormatSchema
dataclass
Describes the naming convention inferred from a group of models.
Used by compose_name to produce names consistent with existing group members.
Source code in src/horde_model_reference/analytics/text_model_parser.py
| @dataclass
class NameFormatSchema:
"""Describes the naming convention inferred from a group of models.
Used by compose_name to produce names consistent with existing group members.
"""
separator: str = "-"
part_order: list[str] = field(default_factory=lambda: ["base", "size", "variant", "version", "quant"])
author_included: bool = False
common_author: str | None = None
template: str = "{base}-{size}"
|
separator
class-attribute
instance-attribute
part_order
class-attribute
instance-attribute
part_order: list[str] = field(
default_factory=lambda: [
"base",
"size",
"variant",
"version",
"quant",
]
)
author_included
class-attribute
instance-attribute
author_included: bool = False
common_author
class-attribute
instance-attribute
common_author: str | None = None
template
class-attribute
instance-attribute
template: str = '{base}-{size}'
__init__
__init__(
separator: str = "-",
part_order: list[str] = (
lambda: [
"base",
"size",
"variant",
"version",
"quant",
]
)(),
author_included: bool = False,
common_author: str | None = None,
template: str = "{base}-{size}",
) -> None
TextModelGroupSummary
dataclass
Aggregated metadata for a group of text model variants.
Source code in src/horde_model_reference/analytics/text_model_parser.py
@dataclass
class TextModelGroupSummary:
    """Aggregated metadata for a group of text model variants.

    Attributes:
        group_name: Name of the group being summarised.
        member_count: Number of models in the group.
        available_sizes: Distinct sizes parsed from member names.
        available_quants: Distinct quantization tokens parsed from member names.
        common_baseline: The baseline shared by the members, or None.
        any_nsfw: Whether at least one member is flagged nsfw.
        any_has_description: Whether at least one member has a description.
        merged_tags: Union of the members' tags.
        name_format: Naming convention inferred for the group.
    """

    # Identity.
    group_name: str
    member_count: int

    # Values parsed out of member names.
    available_sizes: list[str]
    available_quants: list[str]

    # Values aggregated from member metadata.
    common_baseline: str | None
    any_nsfw: bool
    any_has_description: bool
    merged_tags: list[str]

    name_format: NameFormatSchema
|
group_name
instance-attribute
member_count
instance-attribute
available_sizes
instance-attribute
available_sizes: list[str]
available_quants
instance-attribute
available_quants: list[str]
common_baseline
instance-attribute
common_baseline: str | None
any_nsfw
instance-attribute
any_has_description
instance-attribute
any_has_description: bool
merged_tags
instance-attribute
name_format
instance-attribute
name_format: NameFormatSchema
__init__
__init__(
group_name: str,
member_count: int,
available_sizes: list[str],
available_quants: list[str],
common_baseline: str | None,
any_nsfw: bool,
any_has_description: bool,
merged_tags: list[str],
name_format: NameFormatSchema,
) -> None
parse_text_model_name
cached
parse_text_model_name(
model_name: str,
) -> ParsedTextModelName
Parse a text model name into structured components.
Attempts to extract base name, size, variant, and quantization information
from a model name using regex patterns.
Parameters:
-
model_name
(str)
–
The full model name to parse.
Returns:
-
ParsedTextModelName
–
ParsedTextModelName with extracted components.
Example
parsed = parse_text_model_name("Llama-3-8B-Instruct-Q4_K_M")
print(parsed.base_name) # "Llama-3"
print(parsed.size) # "8B"
print(parsed.variant) # "Instruct"
print(parsed.quant) # "Q4_K_M"
Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def parse_text_model_name(model_name: str) -> ParsedTextModelName:
    """Split a text model name into base name, size, version, quant and variant.

    Each component is pulled out with the first matching regex of its pattern
    list (case-insensitive) and cut out of the working string; whatever is
    left, after separator clean-up, becomes the base name. Extraction order is
    size, version, quantization, variant.

    Results are memoised, so repeated calls with the same name return the
    same ``ParsedTextModelName`` object.

    Args:
        model_name: The full model name to parse.

    Returns:
        ParsedTextModelName with extracted components. ``size`` and ``quant``
        are upper-cased; ``version`` and ``variant`` keep the casing found in
        the name. ``base_name`` falls back to ``model_name`` when nothing is
        left after extraction.

    Example:
        >>> parsed = parse_text_model_name("Llama-3-8B-Instruct-Q4_K_M")
        >>> print(parsed.base_name)  # "Llama-3"
        >>> print(parsed.size)  # "8B"
        >>> print(parsed.variant)  # "Instruct"
        >>> print(parsed.quant)  # "Q4_K_M"
    """

    def _pop_first(text: str, patterns: list[str]) -> tuple[str | None, str]:
        # Return (captured token, text with the match removed) for the first
        # pattern that matches, or (None, text) when none does.
        for regex in patterns:
            hit = re.search(regex, text, re.IGNORECASE)
            if hit:
                return hit.group(1), text[: hit.start()] + text[hit.end() :]
        return None, text

    logger.trace(f"Parsing text model name: {model_name}")

    size, remainder = _pop_first(model_name, SIZE_PATTERNS)
    if size is not None:
        size = size.upper()
        logger.trace(f"Extracted size: {size}")

    # Version comes after size so v-prefixed versions aren't confused with sizes.
    version, remainder = _pop_first(remainder, VERSION_PATTERNS)
    if version is not None:
        logger.trace(f"Extracted version: {version}")

    quant, remainder = _pop_first(remainder, QUANT_PATTERNS)
    if quant is not None:
        quant = quant.upper()
        logger.trace(f"Extracted quant: {quant}")

    variant, remainder = _pop_first(remainder, VARIANT_PATTERNS)
    if variant is not None:
        logger.trace(f"Extracted variant: {variant}")

    # Cutting tokens out leaves doubled separators behind ("Llama-3--");
    # collapse them, then trim separators from both ends.
    base_name = remainder
    for sep in SEPARATORS:
        doubled = sep + sep
        while doubled in base_name:
            base_name = base_name.replace(doubled, sep)
    base_name = base_name.strip("-_ .")

    if base_name:
        logger.trace(f"Extracted base name: {base_name}")
    else:
        # The whole name was consumed by the extractors; keep it as the base.
        base_name = model_name
        logger.debug(f"Could not extract base name, using original: {base_name}")

    return ParsedTextModelName(
        original_name=model_name,
        base_name=base_name,
        size=size,
        variant=variant,
        quant=quant,
        version=version,
        normalized_name=normalize_model_name(model_name),
    )
|
get_base_model_name
cached
get_base_model_name(model_name: str) -> str
Get the base model name for grouping purposes.
Extracts just the base name without backend prefix, author prefix,
size, variant, or quantization info. Useful for grouping different
variants of the same model together.
Parameters:
-
model_name
(str)
–
The full model name (may include backend and author prefixes).
Returns:
-
str
–
The base model name without prefixes.
Example
get_base_model_name("Llama-3-8B-Instruct-Q4_K_M")
"Llama-3"
get_base_model_name("Mistral-7B-v0.1")
"Mistral"
get_base_model_name("koboldcpp/sophosympatheia/StrawberryLemonade-L3-70B-v1.2")
"StrawberryLemonade-L3"
get_base_model_name("aphrodite/ReadyArt/Broken-Tutu-24B")
"Broken-Tutu"
Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def get_base_model_name(model_name: str) -> str:
    """Return the grouping key (base model name) for a full model name.

    The backend prefix is removed with ``strip_backend_prefix``, then
    everything up to and including the first remaining "/" is dropped as the
    author prefix, and the rest is parsed with ``parse_text_model_name``.

    Args:
        model_name: The full model name (may include backend and author prefixes).

    Returns:
        The parsed base name: no size, variant, version or quantization token.

    Example:
        (The prefixed examples assume ``strip_backend_prefix`` removes the
        leading "koboldcpp/" / "aphrodite/" segment.)

        >>> get_base_model_name("Llama-3-8B-Instruct-Q4_K_M")
        "Llama-3"
        >>> get_base_model_name("Mistral-7B-v0.1")
        "Mistral"
        >>> get_base_model_name("koboldcpp/sophosympatheia/StrawberryLemonade-L3-70B-v1.2")
        "StrawberryLemonade-L3"
        >>> get_base_model_name("aphrodite/ReadyArt/Broken-Tutu-24B")
        "Broken-Tutu"
    """
    from horde_model_reference.text_backend_names import strip_backend_prefix

    # e.g. "koboldcpp/author/model" -> "author/model"
    backend_free = strip_backend_prefix(model_name)

    # Whatever precedes the first remaining "/" is taken to be the author.
    _author, slash, tail = backend_free.partition("/")
    bare_name = tail if slash else backend_free

    return parse_text_model_name(bare_name).base_name
|
normalize_model_name
cached
normalize_model_name(model_name: str) -> str
Normalize a model name for case-insensitive comparison.
Converts to lowercase and normalizes separators.
Parameters:
-
model_name
(str)
–
The model name to normalize.
Returns:
-
str
–
Normalized model name.
Example
normalize_model_name("Llama-3-8B-Instruct")
"llama_3_8b_instruct"
Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def normalize_model_name(model_name: str) -> str:
    """Produce a lowercase, underscore-separated form of a model name.

    Hyphens, spaces, dots and underscores are treated alike: every run of
    them becomes a single "_", and leading/trailing underscores are removed.

    Args:
        model_name: The model name to normalize.

    Returns:
        Normalized model name.

    Example:
        >>> normalize_model_name("Llama-3-8B-Instruct")
        "llama_3_8b_instruct"
    """
    lowered = model_name.lower()
    # One substitution both unifies the separators and collapses repeats.
    unified = re.sub(r"[-_ .]+", "_", lowered)
    return unified.strip("_")
|
group_text_models_by_base
group_text_models_by_base(
model_names: list[str],
) -> dict[str, TextModelGroup]
Group text model names by their base model.
Groups variants of the same model together based on extracted base names.
Parameters:
-
model_names
(list[str])
–
List of model names to group.
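Returns:
-
dict[str, TextModelGroup]
–
Dictionary mapping each base name to a TextModelGroup holding the full model names that share it.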
Example
models = [
... "Llama-3-8B-Instruct",
... "Llama-3-8B-Instruct-Q4",
... "Llama-3-70B-Instruct",
... "Mistral-7B-v0.1",
... ]
grouped = group_text_models_by_base(models)
print(grouped)
{
"Llama-3": ["Llama-3-8B-Instruct", "Llama-3-8B-Instruct-Q4", "Llama-3-70B-Instruct"],
"Mistral": ["Mistral-7B-v0.1"]
}
Source code in src/horde_model_reference/analytics/text_model_parser.py
def group_text_models_by_base(
    model_names: list[str],
) -> dict[str, TextModelGroup]:
    """Bucket model names by the base name ``get_base_model_name`` gives them.

    Args:
        model_names: List of model names to group.

    Returns:
        Dictionary mapping each base name to a ``TextModelGroup`` whose
        ``variants`` are the full model names, in input order. Groups appear
        in order of first occurrence.

    Example:
        >>> models = [
        ...     "Llama-3-8B-Instruct",
        ...     "Llama-3-8B-Instruct-Q4",
        ...     "Llama-3-70B-Instruct",
        ...     "Mistral-7B-v0.1",
        ... ]
        >>> grouped = group_text_models_by_base(models)
        >>> grouped["Llama-3"].variants
        ["Llama-3-8B-Instruct", "Llama-3-8B-Instruct-Q4", "Llama-3-70B-Instruct"]
        >>> grouped["Mistral"].variants
        ["Mistral-7B-v0.1"]
    """
    grouped: dict[str, list[str]] = {}
    for full_name in model_names:
        grouped.setdefault(get_base_model_name(full_name), []).append(full_name)

    logger.debug(f"Grouped {len(model_names)} models into {len(grouped)} base models")

    result: dict[str, TextModelGroup] = {}
    for key, members in grouped.items():
        result[key] = TextModelGroup(base_name=key, variants=members)
    return result
|
is_quantized_variant
cached
is_quantized_variant(model_name: str) -> bool
Check if a model name indicates a quantized variant.
Parameters:
-
model_name
(str)
–
The model name to check.
Returns:
-
bool
–
True if the model appears to be a quantized variant.
Example
is_quantized_variant("Llama-3-8B-Instruct-Q4_K_M")
True
is_quantized_variant("Llama-3-8B-Instruct")
False
Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def is_quantized_variant(model_name: str) -> bool:
    """Tell whether a quantization token was detected in a model name.

    Args:
        model_name: The model name to check.

    Returns:
        True if ``parse_text_model_name`` found a quant token, else False.

    Example:
        >>> is_quantized_variant("Llama-3-8B-Instruct-Q4_K_M")
        True
        >>> is_quantized_variant("Llama-3-8B-Instruct")
        False
    """
    return parse_text_model_name(model_name).quant is not None
|
get_model_size
cached
get_model_size(model_name: str) -> str | None
Extract the model size from a model name.
Parameters:
-
model_name
(str)
–
The model name to parse.
Returns:
-
str | None
–
The model size (e.g., "7B", "13B") or None if not found.
Example
get_model_size("Llama-3-8B-Instruct")
"8B"
get_model_size("GPT-4")
None
Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def get_model_size(model_name: str) -> str | None:
    """Return the size token parsed from a model name.

    Args:
        model_name: The model name to parse.

    Returns:
        The upper-cased size token (e.g. "7B", "13B"), or None if not found.

    Example:
        >>> get_model_size("Llama-3-8B-Instruct")
        "8B"
        >>> get_model_size("GPT-4")
        None
    """
    return parse_text_model_name(model_name).size
|
get_model_variant
cached
get_model_variant(model_name: str) -> str | None
Extract the model variant from a model name.
Parameters:
-
model_name
(str)
–
The model name to parse.
Returns:
-
str | None
–
The model variant (e.g., "Instruct", "Chat") or None if not found.
Example
get_model_variant("Llama-3-8B-Instruct")
"Instruct"
get_model_variant("Llama-3-8B")
None
Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def get_model_variant(model_name: str) -> str | None:
    """Return the variant keyword parsed from a model name.

    Args:
        model_name: The model name to parse.

    Returns:
        The variant as written in the name (e.g. "Instruct", "Chat"), or
        None if not found.

    Example:
        >>> get_model_variant("Llama-3-8B-Instruct")
        "Instruct"
        >>> get_model_variant("Llama-3-8B")
        None
    """
    return parse_text_model_name(model_name).variant
|
_detect_separator
_detect_separator(names: list[str]) -> str
Detect the dominant separator in model names (ignoring separators within quant tokens).
Source code in src/horde_model_reference/analytics/text_model_parser.py
def _detect_separator(names: list[str]) -> str:
    """Pick "-" or "_" according to which occurs more often across ``names``.

    Quantization tokens are removed first so that the underscores inside
    tokens such as "Q4_K_M" do not count. Ties, including the case of no
    separators at all, resolve to "-".
    """
    tally = {"-": 0, "_": 0}
    for candidate in names:
        remainder = candidate
        for quant_regex in QUANT_PATTERNS:
            remainder = re.sub(quant_regex, "", remainder, flags=re.IGNORECASE)
        for sep in tally:
            tally[sep] += remainder.count(sep)

    if tally["_"] > tally["-"]:
        return "_"
    return "-"
|
_detect_part_order
_detect_part_order(
original: str, parsed: ParsedTextModelName
) -> list[str]
Detect the order of parts in a model name by their position in the original string.
Source code in src/horde_model_reference/analytics/text_model_parser.py
| def _detect_part_order(original: str, parsed: ParsedTextModelName) -> list[str]:
"""Detect the order of parts in a model name by their position in the original string."""
parts: dict[str, str] = {}
if parsed.base_name:
parts["base"] = parsed.base_name
if parsed.size:
parts["size"] = parsed.size
if parsed.variant:
parts["variant"] = parsed.variant
if parsed.version:
parts["version"] = parsed.version
if parsed.quant:
parts["quant"] = parsed.quant
positions: dict[str, int] = {}
original_lower = original.lower()
for part_name, part_value in parts.items():
pos = original_lower.find(part_value.lower())
if pos >= 0:
positions[part_name] = pos
return [name for name, _ in sorted(positions.items(), key=lambda x: x[1])]
|
infer_name_format
infer_name_format(
member_names: list[str],
) -> NameFormatSchema
Infer the naming convention from existing group members.
Analyzes separators, part ordering, and author inclusion across
all member names to produce a schema that can drive consistent
name composition for new variations.
Parameters:
-
member_names
(list[str])
–
List of model names belonging to the same group.
Returns:
-
NameFormatSchema
–
NameFormatSchema describing the group's naming convention.
Source code in src/horde_model_reference/analytics/text_model_parser.py
def infer_name_format(member_names: list[str]) -> NameFormatSchema:
    """Derive a naming schema from the names already present in a group.

    The text before the first "/" of a name is taken as its author. The
    dominant separator is detected across all author-less names, and the part
    order is read from the member with the most detected components (size,
    variant, version, quant); the earliest such member wins a tie.

    Args:
        member_names: List of model names belonging to the same group.

    Returns:
        NameFormatSchema describing the group's naming convention; a default
        schema when ``member_names`` is empty.
    """
    if not member_names:
        return NameFormatSchema()

    # Peel the author prefix off every name that has one.
    seen_authors: set[str] = set()
    bare_names: list[str] = []
    for full_name in member_names:
        prefix, slash, remainder = full_name.partition("/")
        if slash:
            seen_authors.add(prefix)
            bare_names.append(remainder)
        else:
            bare_names.append(full_name)

    author_included = bool(seen_authors)
    # Only report an author when every prefixed member agrees on it.
    common_author = next(iter(seen_authors)) if len(seen_authors) == 1 else None

    separator = _detect_separator(bare_names)

    # Find the member exposing the most components; strict ">" keeps the
    # first of equally rich members.
    best_name = bare_names[0]
    best_parsed = None
    best_score = -1
    for candidate in bare_names:
        candidate_parsed = parse_text_model_name(candidate)
        score = sum(
            1
            for attr in ("size", "variant", "version", "quant")
            if getattr(candidate_parsed, attr)
        )
        if score > best_score:
            best_name, best_parsed, best_score = candidate, candidate_parsed, score

    part_order = _detect_part_order(best_name, best_parsed)

    # Human-readable template, e.g. "{author}/{base}-{size}-{variant}".
    placeholders = separator.join(f"{{{part}}}" for part in part_order)
    template = ("{author}/" if author_included else "") + placeholders

    return NameFormatSchema(
        separator=separator,
        part_order=part_order,
        author_included=author_included,
        common_author=common_author,
        template=template,
    )
|
compute_group_summaries
compute_group_summaries(
models_dict: dict[str, dict[str, object]],
) -> dict[str, TextModelGroupSummary]
Compute aggregated summaries for each text model group.
Expects models_dict entries to already have text_model_group set.
Parses each model name to extract sizes, quants, etc. and aggregates
metadata fields (baseline, nsfw, tags, description) across members.
Parameters:
-
models_dict
(dict[str, dict[str, object]])
–
Mapping of model_name → model_data dicts (mutated legacy JSON).
Returns:
-
dict[str, TextModelGroupSummary]
–
Mapping of group_name → TextModelGroupSummary.
Source code in src/horde_model_reference/analytics/text_model_parser.py
def compute_group_summaries(
    models_dict: dict[str, dict[str, object]],
) -> dict[str, TextModelGroupSummary]:
    """Compute aggregated summaries for each text model group.

    Models are grouped by their ``text_model_group`` value. A model whose
    value is missing, ``None`` or an empty string forms a group under its own
    name, so that a null value is not stringified into a shared "None" group.
    Each member name is parsed to collect sizes and quants, and the metadata
    fields ``baseline``, ``nsfw``, ``description`` and ``tags`` are aggregated
    across members.

    Args:
        models_dict: Mapping of model_name → model_data dicts (mutated legacy JSON).

    Returns:
        Mapping of group_name → TextModelGroupSummary. Sizes, quants and tags
        are sorted as plain strings (so "13B" sorts before "7B").
        ``common_baseline`` is set only when exactly one distinct baseline
        exists among the members.
    """
    # Bucket model names by group, falling back to the model's own name.
    groups: dict[str, list[str]] = {}
    for model_name, model_data in models_dict.items():
        declared_group = model_data.get("text_model_group")
        if declared_group is None or declared_group == "":
            group = model_name
        else:
            group = str(declared_group)
        groups.setdefault(group, []).append(model_name)

    summaries: dict[str, TextModelGroupSummary] = {}
    for group_name, member_names in groups.items():
        sizes: set[str] = set()
        quants: set[str] = set()
        baselines: set[str] = set()
        merged_tags: set[str] = set()
        any_nsfw = False
        any_has_description = False

        for mname in member_names:
            parsed = parse_text_model_name(mname)
            mdata = models_dict[mname]

            if parsed.size:
                sizes.add(parsed.size)
            if parsed.quant:
                quants.add(parsed.quant)

            baseline = mdata.get("baseline")
            if baseline:
                baselines.add(str(baseline))
            if mdata.get("nsfw"):
                any_nsfw = True
            if mdata.get("description"):
                any_has_description = True

            # Non-list tag values are ignored rather than coerced.
            tags = mdata.get("tags")
            if isinstance(tags, list):
                merged_tags.update(str(t) for t in tags)

        summaries[group_name] = TextModelGroupSummary(
            group_name=group_name,
            member_count=len(member_names),
            available_sizes=sorted(sizes),
            available_quants=sorted(quants),
            common_baseline=baselines.pop() if len(baselines) == 1 else None,
            any_nsfw=any_nsfw,
            any_has_description=any_has_description,
            merged_tags=sorted(merged_tags),
            name_format=infer_name_format(member_names),
        )

    return summaries
|