# Source code for loanpy.uralign

"""Descendant–ancestor alignment and correspondence-based scoring."""



# [docs]
class Uralign:
    """Sequential alignment and scoring for etymological comparison.

    The API is language-pair agnostic: method names such as ``hu`` reflect
    historical use (Hungarian vs. proto-Uralic) but accept any two segment lists
    with CV profiles.

    Examples
    --------
    In a loanword-detection pipeline, align donor and recipient segments then
    score against mined correspondences::

        alm_d, alm_a = Uralign.hu(seg_d, seg_a, cv_d[0], cv_a[0])
        score = Uralign.get_score(alm_d, alm_a, scorer, freq_filter=2)

    Notes
    -----
    * **CLDF conversion** — ``Uralign.hu`` writes ``Uralign`` / ``Uralign_cluster``
      columns in cognate tables (UEW-hu, SeimaTurbino-hu).
    * **Quantitative analysis** — loanword-detection pipelines (e.g.
      Indo-Iranian–Hungarian ``make_results.py``) use ``Uralign.hu`` and
      ``Uralign.get_score`` with correspondence scorers from
      :func:`~loanpy.correspondences.get_sound_correspondences`.
    """

    @staticmethod
    def hu(
        seqHU: list[str],
        seqPU: list[str],
        seqHU_cv0: str,
        seqPU_cv0: str,
        initial_gap: bool = True,
        final_gap: bool = True,
    ) -> tuple[list[str], list[str]]:
        """Align two segment sequences with optional initial and final gap rules.

        Parameters
        ----------
        seqHU, seqPU:
            Segment lists. The ``#-`` / ``-`` / ``-#`` markers are inserted
            into the passed lists in place, whereas the clustering, ``+`` and
            truncation steps build new lists. Always use the returned pair;
            the caller's lists may be left only partially updated.
        seqHU_cv0, seqPU_cv0:
            Word-initial C/V labels for gap decisions; only the value ``"V"``
            triggers a gap.
        initial_gap:
            If True and ``seqHU_cv0`` is ``"V"``, prepend ``#-`` to ``seqHU``;
            if ``seqPU_cv0`` is also ``"V"``, prepend ``-`` to ``seqPU``.
        final_gap:
            If True, handle a length mismatch at the word edge:

            * ``seqHU`` shorter: append ``-#`` to ``seqHU`` and merge the
              surplus tail of ``seqPU`` into one ``.``-joined cluster.
            * ``seqHU`` longer: replace the surplus tail of ``seqHU`` with
              ``+`` followed by one ``.``-joined cluster.

            If False, both sequences are truncated to the shorter length.

        Returns
        -------
        tuple[list[str], list[str]]
            The processed ``(seqHU, seqPU)`` pair. When ``final_gap`` is True
            and ``seqHU`` was the longer sequence, the returned ``seqHU`` is
            two items longer than ``seqPU`` (the ``+`` marker and the
            cluster); in every other case both lists have equal length.

        Notes
        -----
        Used in **CLDF conversion** and in **make_results.py** (loanword scoring).
        """
        if initial_gap and seqHU_cv0 == "V":
            seqHU.insert(0, "#-")
            # The second sequence only receives a gap when the first one does.
            if seqPU_cv0 == "V":
                seqPU.insert(0, "-")

        if not final_gap:
            # No edge handling requested: cut both to the common length.
            n = min(len(seqHU), len(seqPU))
            return seqHU[:n], seqPU[:n]

        # diff > 0 in both branches below, so the [-diff:] slices are never
        # the degenerate [-0:] (which would select the whole list).
        diff = abs(len(seqPU) - len(seqHU))
        if len(seqHU) < len(seqPU):
            seqHU.append("-#")
            # diff was computed before the append; dropping `diff` items and
            # adding one cluster makes seqPU as long as the extended seqHU.
            seqPU = seqPU[:-diff] + [".".join(seqPU[-diff:])]
        elif len(seqHU) > len(seqPU):
            seqHU = seqHU[:-diff] + ["+"] + [".".join(seqHU[-diff:])]
        return seqHU, seqPU

    @staticmethod
    def get_score(
        seqA: list[str],
        seqB: list[str],
        scorer: dict[tuple[str, str], float],
        freq_filter: int = 2,
    ) -> float:
        """Sum correspondence scores along an alignment.

        For each aligned pair ``(a, b)`` the key ``(a, b)`` is looked up in
        ``scorer``. Pairs that are missing from ``scorer`` or whose weight is
        below ``freq_filter`` subtract 1000 from the total.

        Parameters
        ----------
        seqA, seqB:
            Parallel aligned token lists. They are paired with ``zip``, so
            surplus items of the longer list are ignored.
        scorer:
            Mapping from correspondence keys to weights (often absolute
            frequencies from :func:`~loanpy.correspondences.get_sound_correspondences`).
        freq_filter:
            Minimum weight for a pair to count positively.

        Returns
        -------
        float
            Aggregate alignment score (an ``int`` when every weight that
            contributes is an ``int``).

        Notes
        -----
        Used in **make_results.py** together with scores from
        :func:`~loanpy.correspondences.get_sound_correspondences`.
        """
        score = 0
        for a, b in zip(seqA, seqB):
            # Unknown pairs default to -1000 and are then penalised like any
            # other pair that falls below the threshold.
            local_score = scorer.get((a, b), -1000)
            if local_score >= freq_filter:
                score += local_score
            else:
                score -= 1000
        return score