"""Sound correspondences from aligned cognate tables."""
from __future__ import annotations
import logging
from collections import Counter, defaultdict
from collections.abc import Mapping, Sequence
def _is_alternating_language_sequence(
    table: Sequence[Mapping[str, str]],
    descendant_language_ids: set[str],
    ancestor_language_ids: set[str],
) -> bool:
    """Check that *table* strictly alternates descendant and ancestor rows.

    Rows at even positions (0, 2, ...) must have a ``Language_ID`` contained
    in *descendant_language_ids*; rows at odd positions must have one
    contained in *ancestor_language_ids*. The reason for a rejection is
    logged at INFO level on the root logger.

    Returns
    -------
    bool
        ``False`` if the table has an odd number of rows or as soon as a row
        carries a language from the wrong group; ``True`` otherwise
        (including for an empty table).
    """
    if len(table) % 2 != 0:
        logging.info("Odd number of rows.")
        return False

    # Index 0 -> set expected at even positions, index 1 -> odd positions.
    expected_by_parity = (descendant_language_ids, ancestor_language_ids)
    for position, entry in enumerate(table):
        permitted = expected_by_parity[position % 2]
        if entry["Language_ID"] in permitted:
            continue
        logging.info(
            "Problem in row %s: %s not in %s", position, entry, permitted
        )
        return False
    return True
[docs]
def add_separator(
    correspondences: dict[str, dict],
    sep: str = " < ",
) -> dict[str, dict]:
    """Return a shallow copy of *correspondences* with string pair keys.

    In the ``AbsoluteFrequency``, ``Cognateset_IDs`` and ``Examples``
    sections every two-element tuple key ``(a, b)`` is replaced by the
    string ``a + sep + b`` (``"a < b"`` with the default separator). Keys of
    any other shape are kept unchanged, and all other sections are carried
    over as-is (the same objects, not copies). The input mapping itself is
    not modified.

    Use when writing TOML (string keys only). In-memory scorers from
    :func:`get_sound_correspondences` use ``(descendant, ancestor)`` tuple
    keys.
    """
    pair_keyed_sections = ("AbsoluteFrequency", "Cognateset_IDs", "Examples")

    result = {}
    for name, mapping in correspondences.items():
        if name not in pair_keyed_sections:
            result[name] = mapping
            continue
        joined = {}
        for key, value in mapping.items():
            if isinstance(key, tuple) and len(key) == 2:
                key = f"{key[0]}{sep}{key[1]}"
            joined[key] = value
        result[name] = joined
    return result
[docs]
def get_sound_correspondences(
    table: Sequence[Mapping[str, str]],
    aligned_col: str,
    prefix_descendant: str = "",
    prefix_ancestor: str = "",
) -> dict[str, dict]:
    """Extract segment correspondences from paired cognate alignment rows.

    Expects ``table`` to list cognate rows in **descendant, ancestor,
    descendant, ancestor, ...** order (same convention as many CLDF
    ``cognates.csv`` exports). Each consecutive pair of rows is zipped
    segment-wise along ``aligned_col``.

    Parameters
    ----------
    table:
        Sequence of row dicts (e.g. from ``csv.DictReader``). Every row needs
        ``aligned_col``; ancestor rows additionally need ``Cognateset_ID``.
    aligned_col:
        Column with space-separated aligned segments (e.g. ``"Uralign"``).
    prefix_descendant, prefix_ancestor:
        Optional prefixes prepended to segment tokens in pair keys and to the
        alignment strings in examples.

    Returns
    -------
    dict
        Four plain ``dict`` sections:

        * ``SoundCorrespondences`` — un-prefixed descendant segment → ancestor
          segments, most frequent first
        * ``AbsoluteFrequency`` — ``(desc, anc)`` → count, ordered by
          ascending count (ties keep first-seen order)
        * ``Cognateset_IDs`` — ``(desc, anc)`` → cognate set ids, without
          duplicates, in first-seen order
        * ``Examples`` — ``(desc, anc)`` → example alignment strings, one per
          occurrence of the pair

    Examples
    --------
    Build a frequency table for alignment scoring::

        with open("cognates.csv", encoding="utf-8", newline="") as handle:
            rows = list(csv.DictReader(handle))
        stats = get_sound_correspondences(rows, "Uralign")
        scorer = stats["AbsoluteFrequency"]

    Notes
    -----
    * **Quantitative analysis** — ``make_results.py`` in the
      Indo-Iranian–Hungarian study calls this on CLDF cognate tables to build
      TOML scorers and in-memory weights for :class:`~loanpy.uralign.Uralign`.
    * **CLDF workflows** — training data from any wordlist with alternating
      descendant/ancestor rows and an alignment column can be passed in; no
      hard-coded language names are required.
    * A trailing row without a partner (odd number of rows) is ignored.
    * If the two alignments of a pair differ in length, the surplus segments
      of the longer one are ignored.
    """
    ancestors_by_descendant = defaultdict(list)
    pair_counts = Counter()
    # Inner dicts serve as insertion-ordered sets of cognate set ids.
    ids_by_pair = defaultdict(dict)
    examples_by_pair = defaultdict(list)

    # Stop at len - 1 so an unpaired trailing row is never indexed.
    for index in range(0, len(table) - 1, 2):
        descendant_row, ancestor_row = table[index], table[index + 1]
        descendant_alignment = descendant_row[aligned_col]
        ancestor_alignment = ancestor_row[aligned_col]
        # The example is the same for every segment pair of this row pair.
        example = (
            f"{prefix_descendant}{descendant_alignment}"
            f" < {prefix_ancestor}{ancestor_alignment}"
        )
        for descendant_seg, ancestor_seg in zip(
            descendant_alignment.split(),
            ancestor_alignment.split(),
        ):
            ancestors_by_descendant[descendant_seg].append(ancestor_seg)
            pair_key = (
                f"{prefix_descendant}{descendant_seg}",
                f"{prefix_ancestor}{ancestor_seg}",
            )
            pair_counts[pair_key] += 1
            ids_by_pair[pair_key][ancestor_row["Cognateset_ID"]] = None
            examples_by_pair[pair_key].append(example)

    # Hand back plain dicts only: a leaked defaultdict would silently insert
    # an empty entry whenever a caller looks up a pair that never occurred.
    return {
        "SoundCorrespondences": {
            descendant: [
                ancestor for ancestor, _ in Counter(ancestors).most_common()
            ]
            for descendant, ancestors in ancestors_by_descendant.items()
        },
        "AbsoluteFrequency": dict(
            sorted(pair_counts.items(), key=lambda item: item[1])
        ),
        "Cognateset_IDs": {
            pair: list(ids) for pair, ids in ids_by_pair.items()
        },
        "Examples": dict(examples_by_pair),
    }