Source code for loanpy.correspondences

"""Sound correspondences from aligned cognate tables."""

from __future__ import annotations

import logging
from collections import Counter, defaultdict
from collections.abc import Mapping, Sequence


def _is_alternating_language_sequence(
    table: Sequence[Mapping[str, str]],
    descendant_language_ids: set[str],
    ancestor_language_ids: set[str],
) -> bool:
    """Return True if rows strictly alternate descendant / ancestor languages."""
    if len(table) % 2:
        logging.info("Odd number of rows.")
        return False
    for index, row in enumerate(table):
        allowed = (
            descendant_language_ids if index % 2 == 0 else ancestor_language_ids
        )
        if row["Language_ID"] not in allowed:
            logging.info("Problem in row %s: %s not in %s", index, row, allowed)
            return False
    return True



[docs]
def add_separator(
    correspondences: dict[str, dict],
    sep: str = " < ",
) -> dict[str, dict]:
    """Return a shallow copy of *correspondences* with pair keys flattened to strings.

    In the ``AbsoluteFrequency``, ``Cognateset_IDs`` and ``Examples`` sections,
    every key that is a 2-tuple ``(a, b)`` is replaced by the string
    ``a + sep + b`` (``"a < b"`` with the default separator). Keys of any other
    shape, and all other sections, are passed through unchanged. The input
    mapping and its sections are not modified.

    Intended for serialisation formats such as TOML that only allow string
    keys; in-memory scorers from :func:`get_sound_correspondences` use
    ``(descendant, ancestor)`` tuple keys.
    """
    pair_keyed_sections = ("AbsoluteFrequency", "Cognateset_IDs", "Examples")
    converted: dict[str, dict] = {}
    for name, entries in correspondences.items():
        if name not in pair_keyed_sections:
            # Not a pair-keyed section: share the original object as-is.
            converted[name] = entries
            continue
        flattened = {}
        for key, value in entries.items():
            is_pair = isinstance(key, tuple) and len(key) == 2
            flattened[f"{key[0]}{sep}{key[1]}" if is_pair else key] = value
        converted[name] = flattened
    return converted




[docs]
def get_sound_correspondences(
    table: Sequence[Mapping[str, str]],
    aligned_col: str,
    prefix_descendant: str = "",
    prefix_ancestor: str = "",
) -> dict[str, dict]:
    """Extract segment correspondences from paired cognate alignment rows.

    Expects ``table`` to list cognate rows in **descendant, ancestor, descendant,
    ancestor, …** order (same convention as many CLDF ``cognates.csv`` exports).
    Each consecutive pair of rows is zipped segment-wise along ``aligned_col``.

    Parameters
    ----------
    table:
        Sequence of row dicts (e.g. from ``csv.DictReader``). Every row needs
        ``aligned_col``; ancestor rows that contribute at least one segment
        pair also need ``Cognateset_ID``.
    aligned_col:
        Column with space-separated aligned segments (e.g. ``"Uralign"``).
    prefix_descendant, prefix_ancestor:
        Optional prefixes prepended to segment tokens in pair keys and to the
        alignment strings in examples. Keys and values of
        ``SoundCorrespondences`` are *not* prefixed.

    Returns
    -------
    dict
        Four plain ``dict`` sections (looking up an unseen key raises
        ``KeyError`` in every section):

        * ``SoundCorrespondences`` — descendant segment → ancestor segments,
          most frequent first (ties keep first-seen order)
        * ``AbsoluteFrequency`` — ``(desc, anc)`` → count, ordered by
          ascending count (ties keep first-seen order)
        * ``Cognateset_IDs`` — ``(desc, anc)`` → cognate set ids, de-duplicated
          in first-seen order
        * ``Examples`` — ``(desc, anc)`` → example alignment strings, one entry
          per occurrence of the pair

    Examples
    --------
    Build a frequency table for alignment scoring::

        rows = list(csv.DictReader(open("cognates.csv", encoding="utf-8")))
        stats = get_sound_correspondences(rows, "Uralign")
        scorer = stats["AbsoluteFrequency"]

    Notes
    -----
    * **Quantitative analysis** — ``make_results.py`` in the Indo-Iranian–Hungarian
      study calls this on CLDF cognate tables to build TOML scorers and in-memory
      weights for :class:`~loanpy.uralign.Uralign`.
    * **CLDF workflows** — training data from any wordlist with alternating
      descendant/ancestor rows and an alignment column can be passed in; no
      hard-coded language names are required.
    * **Unpaired input** — a trailing row without a partner (odd-length table)
      is ignored, and when the two alignments of a pair differ in length the
      surplus segments of the longer one are dropped.
    """
    ancestors_by_descendant: defaultdict = defaultdict(Counter)
    pair_counts: Counter = Counter()
    # Inner dicts serve as insertion-ordered sets of cognate set ids.
    ids_by_pair: defaultdict = defaultdict(dict)
    examples_by_pair: defaultdict = defaultdict(list)

    for index in range(0, len(table) - 1, 2):
        descendant_row, ancestor_row = table[index], table[index + 1]
        descendant_alignment = descendant_row[aligned_col]
        ancestor_alignment = ancestor_row[aligned_col]
        segment_pairs = list(
            zip(descendant_alignment.split(), ancestor_alignment.split())
        )
        if not segment_pairs:
            # Nothing to record; skipping here also means "Cognateset_ID" is
            # only required on rows that actually contribute a pair.
            continue

        # Same for every segment pair of this row pair, so built once.
        cognateset_id = ancestor_row["Cognateset_ID"]
        example = (
            f"{prefix_descendant}{descendant_alignment}"
            f" < {prefix_ancestor}{ancestor_alignment}"
        )

        for descendant_seg, ancestor_seg in segment_pairs:
            ancestors_by_descendant[descendant_seg][ancestor_seg] += 1
            pair_key = (
                f"{prefix_descendant}{descendant_seg}",
                f"{prefix_ancestor}{ancestor_seg}",
            )
            pair_counts[pair_key] += 1
            ids_by_pair[pair_key][cognateset_id] = None
            examples_by_pair[pair_key].append(example)

    # Convert every accumulator to a plain dict so callers never receive a
    # defaultdict that would silently grow on a failed lookup.
    return {
        "SoundCorrespondences": {
            descendant: [ancestor for ancestor, _ in counts.most_common()]
            for descendant, counts in ancestors_by_descendant.items()
        },
        # sorted() is stable, so equally frequent pairs keep first-seen order.
        "AbsoluteFrequency": dict(
            sorted(pair_counts.items(), key=lambda item: item[1])
        ),
        "Cognateset_IDs": {pair: list(ids) for pair, ids in ids_by_pair.items()},
        "Examples": dict(examples_by_pair),
    }