Skip to content

text_model_parser

Text model name parser for grouping and analysis.

Provides functions to parse text generation model names into structured components such as base name, size, variant, quantization, and version. Useful for grouping model variants (e.g., different quantized versions of the same base model).

SIZE_PATTERNS module-attribute

# Regexes for the model-size token; capture group 1 holds the size text.
SIZE_PATTERNS = [
    # Digits (optionally with a decimal part) followed by B, M or K, e.g. "7B", "1.5B", "7B1".
    r"(?<![a-zA-Z])(\d+\.?\d*[BMK]\d*)(?![a-zA-Z])",
    # Two digit groups joined by "x" and followed by B, M or K, e.g. "8x7B".
    r"(?<![a-zA-Z])(\d+x\d+[BMK])(?![a-zA-Z])",
]

VERSION_PATTERNS module-attribute

# Regexes for the version token; capture group 1 holds the version text.
VERSION_PATTERNS = [
    # Dotted version such as "v0.1" or "V2.1.3".
    r"(?<![a-zA-Z0-9])([vV]\d+(?:\.\d+)+)(?![a-zA-Z0-9])",
    # Bare major version such as "v2" (must not be followed by a dot).
    r"(?<![a-zA-Z0-9])([vV]\d+)(?![a-zA-Z0-9.])",
]

VARIANT_PATTERNS module-attribute

# Regexes for the variant token; capture group 1 holds the variant text.
VARIANT_PATTERNS = [
    r"\b(Instruct|Chat|Code|Base|Uncensored|Finetune|FT)\b",
    r"\b(turbo|preview|latest)\b",
]

QUANT_PATTERNS module-attribute

# Regexes for the quantization token, listed from most to least specific;
# capture group 1 holds the quant text.
QUANT_PATTERNS = [
    # K-quants, e.g. "Q4_K" or "Q4_K_M".
    r"\b(Q[2-8]_K(?:_[SMLH])?)\b",
    # "Q4_0" / "Q5_1" style.
    r"\b(Q[2-8]_[01])\b",
    # Bare "Q4" / "Q8".
    r"\b(Q[2-8])\b",
    # Format / method names.
    r"\b(GGUF|GGML|GPTQ|AWQ|EXL2)\b",
    # Numeric precision names.
    r"\b(fp16|fp32|int8|int4)\b",
]

SEPARATORS module-attribute

# Characters treated as separators between name parts; runs of the same one are collapsed.
SEPARATORS = ["-", "_", " ", "."]

ParsedTextModelName dataclass

Structured representation of a parsed text model name.

Attributes:

  • original_name (str) –

    The original model name as provided.

  • base_name (str) –

    The base model name without size/variant/quant/version info.

  • size (str | None) –

    Model size if detected (e.g., "7B", "13B", "70B", "7B1").

  • variant (str | None) –

    Model variant if detected (e.g., "Instruct", "Chat", "Code").

  • quant (str | None) –

    Quantization type if detected (e.g., "Q4", "Q8", "GGUF").

  • version (str | None) –

    Model version if detected (e.g., "v0.1", "v2.1").

  • normalized_name (str | None) –

    A normalized version of the name for comparison.

Source code in src/horde_model_reference/analytics/text_model_parser.py
@dataclass
class ParsedTextModelName:
    """Structured representation of a parsed text model name.

    Attributes:
        original_name: The original model name as provided.
        base_name: The base model name without size/variant/quant/version info.
        size: Model size if detected (e.g., "7B", "13B", "70B", "7B1").
        variant: Model variant if detected (e.g., "Instruct", "Chat", "Code").
        quant: Quantization type if detected (e.g., "Q4", "Q8", "GGUF").
        version: Model version if detected (e.g., "v0.1", "v2.1").
        normalized_name: A normalized version of the name for comparison.

    """

    original_name: str
    base_name: str
    size: str | None = None
    variant: str | None = None
    quant: str | None = None
    version: str | None = None
    normalized_name: str | None = None

original_name instance-attribute

original_name: str

base_name instance-attribute

base_name: str

size class-attribute instance-attribute

size: str | None = None

variant class-attribute instance-attribute

variant: str | None = None

quant class-attribute instance-attribute

quant: str | None = None

version class-attribute instance-attribute

version: str | None = None

normalized_name class-attribute instance-attribute

normalized_name: str | None = None

__init__

__init__(
    original_name: str,
    base_name: str,
    size: str | None = None,
    variant: str | None = None,
    quant: str | None = None,
    version: str | None = None,
    normalized_name: str | None = None,
) -> None

TextModelGroup dataclass

Represents a group of text model variants sharing the same base model.

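Attributes:

  • base_name (str) –

    The base model name.

  • variants (list[str]) –

    List of full model names that are variants of the base model.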

Source code in src/horde_model_reference/analytics/text_model_parser.py
@dataclass
class TextModelGroup:
    """A base model together with the full names of its variants.

    Attributes:
        base_name: The base model name shared by every member of the group.
        variants: Full model names that are variants of ``base_name``.

    """

    base_name: str
    variants: list[str]

base_name instance-attribute

base_name: str

variants instance-attribute

variants: list[str]

__init__

__init__(base_name: str, variants: list[str]) -> None

NameFormatSchema dataclass

Describes the naming convention inferred from a group of models.

Used by compose_name to produce names consistent with existing group members.

Source code in src/horde_model_reference/analytics/text_model_parser.py
@dataclass
class NameFormatSchema:
    """Describes the naming convention inferred from a group of models.

    Used by compose_name to produce names consistent with existing group members.
    """

    separator: str = "-"
    part_order: list[str] = field(default_factory=lambda: ["base", "size", "variant", "version", "quant"])
    author_included: bool = False
    common_author: str | None = None
    template: str = "{base}-{size}"

separator class-attribute instance-attribute

separator: str = '-'

part_order class-attribute instance-attribute

part_order: list[str] = field(
    default_factory=lambda: [
        "base",
        "size",
        "variant",
        "version",
        "quant",
    ]
)

author_included class-attribute instance-attribute

author_included: bool = False

common_author class-attribute instance-attribute

common_author: str | None = None

template class-attribute instance-attribute

template: str = '{base}-{size}'

__init__

__init__(
    separator: str = "-",
    part_order: list[str] = (
        lambda: [
            "base",
            "size",
            "variant",
            "version",
            "quant",
        ]
    )(),
    author_included: bool = False,
    common_author: str | None = None,
    template: str = "{base}-{size}",
) -> None

TextModelGroupSummary dataclass

Aggregated metadata for a group of text model variants.

Source code in src/horde_model_reference/analytics/text_model_parser.py
@dataclass
class TextModelGroupSummary:
    """Aggregated metadata for a group of text model variants.

    The descriptions below reflect how ``compute_group_summaries`` fills the fields.

    Attributes:
        group_name: Name of the group being summarized.
        member_count: Number of models in the group.
        available_sizes: Sorted distinct sizes parsed from the member names.
        available_quants: Sorted distinct quantization tokens parsed from the member names.
        common_baseline: The baseline value when exactly one distinct value is
            present among members, otherwise ``None``.
        any_nsfw: Whether at least one member has a truthy ``nsfw`` field.
        any_has_description: Whether at least one member has a non-empty description.
        merged_tags: Sorted union of the members' tags.
        name_format: Naming convention inferred from the member names.

    """

    group_name: str
    member_count: int
    available_sizes: list[str]
    available_quants: list[str]
    common_baseline: str | None
    any_nsfw: bool
    any_has_description: bool
    merged_tags: list[str]
    name_format: NameFormatSchema

group_name instance-attribute

group_name: str

member_count instance-attribute

member_count: int

available_sizes instance-attribute

available_sizes: list[str]

available_quants instance-attribute

available_quants: list[str]

common_baseline instance-attribute

common_baseline: str | None

any_nsfw instance-attribute

any_nsfw: bool

any_has_description instance-attribute

any_has_description: bool

merged_tags instance-attribute

merged_tags: list[str]

name_format instance-attribute

name_format: NameFormatSchema

__init__

__init__(
    group_name: str,
    member_count: int,
    available_sizes: list[str],
    available_quants: list[str],
    common_baseline: str | None,
    any_nsfw: bool,
    any_has_description: bool,
    merged_tags: list[str],
    name_format: NameFormatSchema,
) -> None

parse_text_model_name cached

parse_text_model_name(
    model_name: str,
) -> ParsedTextModelName

Parse a text model name into structured components.

Attempts to extract base name, size, version, quantization, and variant information from a model name using regex patterns.

Parameters:

  • model_name (str) –

    The full model name to parse.

Returns:

  • ParsedTextModelName –

    ParsedTextModelName with extracted components.

Example

>>> parsed = parse_text_model_name("Llama-3-8B-Instruct-Q4_K_M")
>>> print(parsed.base_name)  # "Llama-3"
>>> print(parsed.size)  # "8B"
>>> print(parsed.variant)  # "Instruct"
>>> print(parsed.quant)  # "Q4_K_M"

Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def parse_text_model_name(model_name: str) -> ParsedTextModelName:
    """Split a text model name into its structured components.

    Size, version, quantization and variant tokens are located with the
    module's regex pattern lists (case-insensitively) and cut out of the
    name one after another; whatever is left becomes the base name.

    Args:
        model_name: The full model name to parse.

    Returns:
        ParsedTextModelName holding the extracted components. Components that
        were not found are ``None``; if nothing is left for the base name,
        the original name is used instead.

    Example:
        >>> parsed = parse_text_model_name("Llama-3-8B-Instruct-Q4_K_M")
        >>> print(parsed.base_name)  # "Llama-3"
        >>> print(parsed.size)  # "8B"
        >>> print(parsed.variant)  # "Instruct"
        >>> print(parsed.quant)  # "Q4_K_M"

    """
    logger.trace(f"Parsing text model name: {model_name}")

    # (label, patterns, upper-case the captured token?) in extraction order.
    # Version comes after size so v-prefixed versions aren't confused with sizes.
    extraction_plan = (
        ("size", SIZE_PATTERNS, True),
        ("version", VERSION_PATTERNS, False),
        ("quant", QUANT_PATTERNS, True),
        ("variant", VARIANT_PATTERNS, False),
    )

    remainder = model_name
    extracted: dict[str, str | None] = {}
    for label, patterns, force_upper in extraction_plan:
        extracted[label] = None
        for pattern in patterns:
            hit = re.search(pattern, remainder, re.IGNORECASE)
            if hit is None:
                continue
            token = hit.group(1)
            extracted[label] = token.upper() if force_upper else token
            # Cut the matched token out so later patterns cannot see it.
            remainder = remainder[: hit.start()] + remainder[hit.end() :]
            logger.trace(f"Extracted {label}: {extracted[label]}")
            break

    # Removing tokens leaves doubled separators behind ("Llama-3--"); collapse
    # them and trim separators from both ends.
    base_name = remainder
    for sep in SEPARATORS:
        doubled = sep + sep
        while doubled in base_name:
            base_name = base_name.replace(doubled, sep)
    base_name = base_name.strip("-_ .")

    if base_name:
        logger.trace(f"Extracted base name: {base_name}")
    else:
        base_name = model_name
        logger.debug(f"Could not extract base name, using original: {base_name}")

    return ParsedTextModelName(
        original_name=model_name,
        base_name=base_name,
        size=extracted["size"],
        variant=extracted["variant"],
        quant=extracted["quant"],
        version=extracted["version"],
        normalized_name=normalize_model_name(model_name),
    )

get_base_model_name cached

get_base_model_name(model_name: str) -> str

Get the base model name for grouping purposes.

Extracts just the base name without backend prefix, author prefix, size, version, variant, or quantization info. Useful for grouping different variants of the same model together.

Parameters:

  • model_name (str) –

    The full model name (may include backend and author prefixes).

Returns:

  • str

    The base model name without prefixes.

Example

>>> get_base_model_name("Llama-3-8B-Instruct-Q4_K_M")
"Llama-3"
>>> get_base_model_name("Mistral-7B-v0.1")
"Mistral"
>>> get_base_model_name("koboldcpp/sophosympatheia/StrawberryLemonade-L3-70B-v1.2")
"StrawberryLemonade-L3"
>>> get_base_model_name("aphrodite/ReadyArt/Broken-Tutu-24B")
"Broken-Tutu"

Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def get_base_model_name(model_name: str) -> str:
    """Get the base model name for grouping purposes.

    Strips the backend prefix and the author prefix, then parses what is
    left and returns its base name, i.e. the name without size, version,
    quantization or variant tokens. Useful for grouping different variants
    of the same model together.

    Args:
        model_name: The full model name (may include backend and author prefixes).

    Returns:
        The base model name without prefixes.

    Example:
        >>> get_base_model_name("Llama-3-8B-Instruct-Q4_K_M")
        "Llama-3"
        >>> get_base_model_name("Mistral-7B-v0.1")
        "Mistral"
        >>> get_base_model_name("koboldcpp/sophosympatheia/StrawberryLemonade-L3-70B-v1.2")
        "StrawberryLemonade-L3"
        >>> get_base_model_name("aphrodite/ReadyArt/Broken-Tutu-24B")
        "Broken-Tutu"

    """
    # NOTE(review): imported at call time rather than module level, presumably
    # to avoid an import cycle; confirm before moving it.
    from horde_model_reference.text_backend_names import strip_backend_prefix

    # First strip backend prefix (e.g., "koboldcpp/", "aphrodite/").
    name_without_backend = strip_backend_prefix(model_name)

    # Then drop the author prefix (e.g., "sophosympatheia/", "ReadyArt/"):
    # everything up to and including the first remaining "/".
    _, slash, after_author = name_without_backend.partition("/")
    name_without_author = after_author if slash else name_without_backend

    return parse_text_model_name(name_without_author).base_name

normalize_model_name cached

normalize_model_name(model_name: str) -> str

Normalize a model name for case-insensitive comparison.

Converts to lowercase and normalizes separators.

Parameters:

  • model_name (str) –

    The model name to normalize.

Returns:

  • str

    Normalized model name.

Example

>>> normalize_model_name("Llama-3-8B-Instruct")
"llama_3_8b_instruct"

Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def normalize_model_name(model_name: str) -> str:
    """Produce a lowercase, underscore-separated form of a model name.

    Intended for case-insensitive comparison: the name is lowercased, every
    run of hyphens, spaces, dots and underscores becomes a single underscore,
    and leading/trailing underscores are removed.

    Args:
        model_name: The model name to normalize.

    Returns:
        Normalized model name.

    Example:
        >>> normalize_model_name("Llama-3-8B-Instruct")
        "llama_3_8b_instruct"

    """
    lowered = model_name.lower()
    # One pass: any mix of adjacent separators collapses to a single "_".
    unified = re.sub(r"[-_ .]+", "_", lowered)
    return unified.strip("_")

group_text_models_by_base

group_text_models_by_base(
    model_names: list[str],
) -> dict[str, TextModelGroup]

Group text model names by their base model.

Groups variants of the same model together based on extracted base names.

Parameters:

  • model_names (list[str]) –

    List of model names to group.

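Returns:

  • dict[str, TextModelGroup] –

    Dictionary mapping each base name to a TextModelGroup holding the full model names that share it.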

Example

>>> models = [
...     "Llama-3-8B-Instruct",
...     "Llama-3-8B-Instruct-Q4",
...     "Llama-3-70B-Instruct",
...     "Mistral-7B-v0.1",
... ]
>>> grouped = group_text_models_by_base(models)
>>> grouped["Llama-3"].variants
["Llama-3-8B-Instruct", "Llama-3-8B-Instruct-Q4", "Llama-3-70B-Instruct"]
>>> grouped["Mistral"].variants
["Mistral-7B-v0.1"]

Source code in src/horde_model_reference/analytics/text_model_parser.py
def group_text_models_by_base(
    model_names: list[str],
) -> dict[str, TextModelGroup]:
    """Group text model names by their base model.

    Each name is reduced to its base name with ``get_base_model_name``;
    names sharing a base name end up in the same group, in input order.

    Args:
        model_names: List of model names to group.

    Returns:
        Dictionary mapping each base name to a TextModelGroup whose
        ``variants`` list holds the full model names that share it.

    Example:
        >>> models = [
        ...     "Llama-3-8B-Instruct",
        ...     "Llama-3-8B-Instruct-Q4",
        ...     "Llama-3-70B-Instruct",
        ...     "Mistral-7B-v0.1",
        ... ]
        >>> grouped = group_text_models_by_base(models)
        >>> grouped["Llama-3"].variants
        ["Llama-3-8B-Instruct", "Llama-3-8B-Instruct-Q4", "Llama-3-70B-Instruct"]
        >>> grouped["Mistral"].variants
        ["Mistral-7B-v0.1"]

    """
    names_by_base: dict[str, list[str]] = {}
    for model_name in model_names:
        names_by_base.setdefault(get_base_model_name(model_name), []).append(model_name)

    logger.debug(f"Grouped {len(model_names)} models into {len(names_by_base)} base models")

    return {
        base_name: TextModelGroup(base_name=base_name, variants=variants)
        for base_name, variants in names_by_base.items()
    }

is_quantized_variant cached

is_quantized_variant(model_name: str) -> bool

Check if a model name indicates a quantized variant.

Parameters:

  • model_name (str) –

    The model name to check.

Returns:

  • bool

    True if the model appears to be a quantized variant.

Example

>>> is_quantized_variant("Llama-3-8B-Instruct-Q4_K_M")
True
>>> is_quantized_variant("Llama-3-8B-Instruct")
False

Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def is_quantized_variant(model_name: str) -> bool:
    """Tell whether a model name carries a quantization token.

    Args:
        model_name: The model name to check.

    Returns:
        True if parsing the name yields a quantization component.

    Example:
        >>> is_quantized_variant("Llama-3-8B-Instruct-Q4_K_M")
        True
        >>> is_quantized_variant("Llama-3-8B-Instruct")
        False

    """
    return parse_text_model_name(model_name).quant is not None

get_model_size cached

get_model_size(model_name: str) -> str | None

Extract the model size from a model name.

Parameters:

  • model_name (str) –

    The model name to parse.

Returns:

  • str | None

    The model size (e.g., "7B", "13B") or None if not found.

Example

>>> get_model_size("Llama-3-8B-Instruct")
"8B"
>>> get_model_size("GPT-4")
None

Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def get_model_size(model_name: str) -> str | None:
    """Return the size component of a model name.

    Args:
        model_name: The model name to parse.

    Returns:
        The model size (e.g., "7B", "13B"), or None when no size is detected.

    Example:
        >>> get_model_size("Llama-3-8B-Instruct")
        "8B"
        >>> get_model_size("GPT-4")
        None

    """
    return parse_text_model_name(model_name).size

get_model_variant cached

get_model_variant(model_name: str) -> str | None

Extract the model variant from a model name.

Parameters:

  • model_name (str) –

    The model name to parse.

Returns:

  • str | None

    The model variant (e.g., "Instruct", "Chat") or None if not found.

Example

>>> get_model_variant("Llama-3-8B-Instruct")
"Instruct"
>>> get_model_variant("Llama-3-8B")
None

Source code in src/horde_model_reference/analytics/text_model_parser.py
@lru_cache(maxsize=2048)
def get_model_variant(model_name: str) -> str | None:
    """Return the variant component of a model name.

    Args:
        model_name: The model name to parse.

    Returns:
        The model variant (e.g., "Instruct", "Chat"), or None when no variant is detected.

    Example:
        >>> get_model_variant("Llama-3-8B-Instruct")
        "Instruct"
        >>> get_model_variant("Llama-3-8B")
        None

    """
    return parse_text_model_name(model_name).variant

_detect_separator

_detect_separator(names: list[str]) -> str

Detect the dominant separator in model names (ignoring separators within quant tokens).

Source code in src/horde_model_reference/analytics/text_model_parser.py
def _detect_separator(names: list[str]) -> str:
    """Pick the dominant separator ("-" or "_") across a list of model names.

    Quantization tokens are removed before counting so that underscores
    inside them (e.g. "Q4_K_M") do not skew the result. Hyphen wins ties.
    """
    hyphens = 0
    underscores = 0

    for raw_name in names:
        without_quant = raw_name
        for quant_pattern in QUANT_PATTERNS:
            without_quant = re.sub(quant_pattern, "", without_quant, flags=re.IGNORECASE)

        hyphens += without_quant.count("-")
        underscores += without_quant.count("_")

    if underscores > hyphens:
        return "_"
    return "-"

_detect_part_order

_detect_part_order(
    original: str, parsed: ParsedTextModelName
) -> list[str]

Detect the order of parts in a model name by their position in the original string.

Source code in src/horde_model_reference/analytics/text_model_parser.py
def _detect_part_order(original: str, parsed: ParsedTextModelName) -> list[str]:
    """Detect the order of parts in a model name by their position in the original string."""
    parts: dict[str, str] = {}
    if parsed.base_name:
        parts["base"] = parsed.base_name
    if parsed.size:
        parts["size"] = parsed.size
    if parsed.variant:
        parts["variant"] = parsed.variant
    if parsed.version:
        parts["version"] = parsed.version
    if parsed.quant:
        parts["quant"] = parsed.quant

    positions: dict[str, int] = {}
    original_lower = original.lower()
    for part_name, part_value in parts.items():
        pos = original_lower.find(part_value.lower())
        if pos >= 0:
            positions[part_name] = pos

    return [name for name, _ in sorted(positions.items(), key=lambda x: x[1])]

infer_name_format

infer_name_format(
    member_names: list[str],
) -> NameFormatSchema

Infer the naming convention from existing group members.

Analyzes separators, part ordering, and author inclusion across all member names to produce a schema that can drive consistent name composition for new variations.

Parameters:

  • member_names (list[str]) –

    List of model names belonging to the same group.

Returns:

  • NameFormatSchema

    NameFormatSchema describing the group's naming convention.

Source code in src/horde_model_reference/analytics/text_model_parser.py
def infer_name_format(member_names: list[str]) -> NameFormatSchema:
    """Derive a naming convention from the names already in a group.

    Looks at author prefixes, the dominant separator and the order of name
    parts across the members, and returns a schema that can drive
    consistent name composition for new variations.

    Args:
        member_names: List of model names belonging to the same group.

    Returns:
        NameFormatSchema describing the group's naming convention. An empty
        input yields the default schema.

    """
    if not member_names:
        return NameFormatSchema()

    # Split "author/rest" names; names without "/" are kept whole.
    authors: set[str] = set()
    bare_names: list[str] = []
    for full_name in member_names:
        prefix, slash, remainder = full_name.partition("/")
        if slash:
            authors.add(prefix)
            bare_names.append(remainder)
        else:
            bare_names.append(full_name)

    author_included = bool(authors)
    common_author = next(iter(authors)) if len(authors) == 1 else None

    separator = _detect_separator(bare_names)

    # The member with the most detected parts is the best witness for part
    # order; max() keeps the first such member on ties.
    def detail_count(candidate: tuple[str, ParsedTextModelName]) -> int:
        info = candidate[1]
        return sum(1 for value in (info.size, info.variant, info.version, info.quant) if value)

    parsed_members = [parse_text_model_name(bare) for bare in bare_names]
    richest_name, richest_parsed = max(
        zip(bare_names, parsed_members, strict=False),
        key=detail_count,
    )
    part_order = _detect_part_order(richest_name, richest_parsed)

    # Human-readable template: optional author prefix, then the placeholders
    # joined by the detected separator.
    placeholders = separator.join(f"{{{part}}}" for part in part_order)
    template = ("{author}/" if author_included else "") + placeholders

    return NameFormatSchema(
        separator=separator,
        part_order=part_order,
        author_included=author_included,
        common_author=common_author,
        template=template,
    )

compute_group_summaries

compute_group_summaries(
    models_dict: dict[str, dict[str, object]],
) -> dict[str, TextModelGroupSummary]

Compute aggregated summaries for each text model group.

Expects models_dict entries to already have text_model_group set. Parses each model name to extract sizes, quants, etc. and aggregates metadata fields (baseline, nsfw, tags, description) across members.

Parameters:

  • models_dict (dict[str, dict[str, object]]) –

    Mapping of model_name → model_data dicts (mutated legacy JSON).

Returns:

  • dict[str, TextModelGroupSummary] –

    Mapping of group_name → TextModelGroupSummary.

Source code in src/horde_model_reference/analytics/text_model_parser.py
def compute_group_summaries(
    models_dict: dict[str, dict[str, object]],
) -> dict[str, TextModelGroupSummary]:
    """Compute aggregated summaries for each text model group.

    Expects models_dict entries to already have ``text_model_group`` set.
    An entry whose ``text_model_group`` is missing, ``None`` or empty is
    treated as a group of its own, named after the model.
    Parses each model name to extract sizes, quants, etc. and aggregates
    metadata fields (baseline, nsfw, tags, description) across members.

    Args:
        models_dict: Mapping of model_name → model_data dicts (mutated legacy JSON).

    Returns:
        Mapping of group_name → TextModelGroupSummary.

    """
    # Group model names by their text_model_group value.
    groups: dict[str, list[str]] = {}
    for model_name, model_data in models_dict.items():
        # `or` instead of a .get() default: a key that is present but None
        # must not become the literal group name "None".
        group = str(model_data.get("text_model_group") or model_name)
        groups.setdefault(group, []).append(model_name)

    summaries: dict[str, TextModelGroupSummary] = {}
    for group_name, member_names in groups.items():
        sizes: set[str] = set()
        quants: set[str] = set()
        baselines: set[str] = set()
        merged_tags: set[str] = set()
        any_nsfw = False
        any_has_description = False

        for member_name in member_names:
            parsed = parse_text_model_name(member_name)
            member_data = models_dict[member_name]

            if parsed.size:
                sizes.add(parsed.size)
            if parsed.quant:
                quants.add(parsed.quant)

            baseline = member_data.get("baseline")
            if baseline:
                baselines.add(str(baseline))
            if member_data.get("nsfw"):
                any_nsfw = True
            if member_data.get("description"):
                any_has_description = True

            # Only list-valued tags are merged; any other shape is ignored.
            tags = member_data.get("tags")
            if isinstance(tags, list):
                merged_tags.update(str(tag) for tag in tags)

        summaries[group_name] = TextModelGroupSummary(
            group_name=group_name,
            member_count=len(member_names),
            available_sizes=sorted(sizes),
            available_quants=sorted(quants),
            # A baseline is "common" only when exactly one distinct value was seen.
            common_baseline=next(iter(baselines)) if len(baselines) == 1 else None,
            any_nsfw=any_nsfw,
            any_has_description=any_has_description,
            merged_tags=sorted(merged_tags),
            name_format=infer_name_format(member_names),
        )

    return summaries