Source code for loanpy.cluster

"""Phoneme clustering: CV grouping, glide clustering, and gap collapsing."""



[docs]
class Cluster:
    """Static helpers for segment clustering in CLDF pipelines.

    Clustering reduces fine-grained segment lists to coarser units used in
    alignment and correspondence mining (e.g. ``f.l`` for consonant clusters,
    ``a.ʊ`` for vowel sequences). Members of one cluster are joined with
    ``"."``.

    All methods are static; the class only serves as a namespace.

    Examples
    --------
    Typical workflow during CLDF conversion::

        segments = form.split()
        cv = dataset.get_cv_profile(form)
        clusters = Cluster.cv(segments, cv)
        glides = Cluster.glides(segments, cv)

    After pairwise alignment, gaps may be collapsed::

        alm_a, alm_b = Cluster.gaps(alm_a, alm_b)

    Notes
    -----
    Used in **CLDF conversion** scripts (``cldfbench_*.py``) for datasets such as
    UEW-hu, SeimaTurbino-hu, UESz-year-origin, and WestOldTurkic, where clustered
    segments are written to ``forms.csv`` columns like ``Clusters`` or
    ``Cluster_cv``.
    """

    @staticmethod
    def cv(segments: list[str], cv_profile: list[str]) -> list[str]:
        """Join adjacent segments that share the same C/V class.

        Parameters
        ----------
        segments:
            IPA (or other) segments, one symbol per list element.
        cv_profile:
            Parallel list of ``"C"`` and ``"V"`` labels.

        Returns
        -------
        list[str]
            Clustered segments joined with ``"."`` within each run of C or V.
            An empty input yields an empty list.

        Notes
        -----
        The two lists are walked in lockstep with ``zip``, so if they differ
        in length the surplus tail of the longer one is ignored.

        Examples
        --------
        >>> Cluster.cv(["f", "l", "a"], ["C", "C", "V"])
        ['f.l', 'a']
        >>> Cluster.cv([], [])
        []
        """
        result: list[str] = []
        previous_label = None
        for segment, label in zip(segments, cv_profile):
            if result and label == previous_label:
                # Same class as the segment before: extend the open cluster.
                result[-1] += "." + segment
            else:
                # First segment, or the class changed: start a new cluster.
                result.append(segment)
            previous_label = label
        return result

    @staticmethod
    def glides(
        segments: list[str],
        cv_profile: list[str],
        cluster_between_vowels: tuple[str, ...] = ("ɣ", "w", "v", "β", "ð"),
        cluster_after_l: tuple[str, ...] = ("t͡ʃ", "d"),
    ) -> list[str]:
        """Cluster glides after vowels and selected consonants after ``l``.

        The work is done in two passes:

        1. A segment listed in ``cluster_between_vowels`` is attached to the
           preceding cluster when the segment directly before it is labelled
           ``"V"``; a segment listed in ``cluster_after_l`` is attached when
           the preceding cluster is exactly ``"l"``.
        2. A plain vowel (label exactly ``"V"``) is attached to the preceding
           cluster when that cluster contains ``".<glide>"`` for any glide in
           ``cluster_between_vowels``.

        Parameters
        ----------
        segments, cv_profile:
            Parallel segment and C/V lists (same length).
        cluster_between_vowels:
            Segments to attach to the preceding cluster when they directly
            follow a vowel. A following vowel is then merged into the same
            cluster; the glide is attached even if no vowel follows.
        cluster_after_l:
            Segments to attach when immediately following a bare ``l``.

        Returns
        -------
        list[str]
            Further clustered segment list, members joined with ``"."``.

        Raises
        ------
        ValueError
            If ``segments`` and ``cv_profile`` differ in length.

        Notes
        -----
        Used in **CLDF conversion** (e.g. UESz-year-origin ``Cluster_glide`` column,
        WestOldTurkic and koeblergothic ``Clusters``). Default glide symbols include
        Gothic intervocalic ``β`` and ``ð``.

        Examples
        --------
        >>> Cluster.glides(["a", "w", "a"], ["V", "C", "V"])
        ['a.w.a']
        >>> Cluster.glides(["a", "w", "t"], ["V", "C", "C"])
        ['a.w', 't']
        >>> Cluster.glides(["k", "a", "l", "d", "a"], ["C", "V", "C", "C", "V"])
        ['k', 'a', 'l.d', 'a']
        """
        if len(segments) != len(cv_profile):
            raise ValueError("segments and cv_profile must have the same length")

        # Pass 1: glue glides onto a preceding vowel and selected consonants
        # onto a preceding bare "l". ``merged_profile`` mirrors ``merged`` so
        # that pass 2 can tell untouched vowels ("V") from merged clusters
        # (e.g. "V.C").
        merged: list[str] = []
        merged_profile: list[str] = []
        for idx, phoneme in enumerate(segments):
            # ``merged`` is empty only for the very first segment, which can
            # never be attached to anything.
            if merged and (
                (phoneme in cluster_between_vowels and cv_profile[idx - 1] == "V")
                or (phoneme in cluster_after_l and merged[-1] == "l")
            ):
                merged[-1] += f".{phoneme}"
                merged_profile[-1] += f".{cv_profile[idx]}"
            else:
                merged.append(phoneme)
                merged_profile.append(cv_profile[idx])

        # Pass 2: pull a plain vowel into a preceding cluster that carries a
        # glide. The markers are loop-invariant, so build them once.
        glide_markers = tuple(f".{ph}" for ph in cluster_between_vowels)
        clustered: list[str] = []
        for phoneme, profile in zip(merged, merged_profile):
            if (
                clustered
                and profile == "V"
                and any(marker in clustered[-1] for marker in glide_markers)
            ):
                clustered[-1] += f".{phoneme}"
            else:
                clustered.append(phoneme)
        return clustered

    @staticmethod
    def gaps(seqA: list[str], seqB: list[str]) -> tuple[list[str], list[str]]:
        """Collapse consecutive gaps on ``seqB`` into a single gap per position.

        When two adjacent positions in ``seqB`` are gaps (``"-"``), the matching
        symbol in ``seqA`` is merged (with ``"."``) into the previous ``seqA``
        token and the extra gap is dropped.

        If the collapsed ``seqB`` still ends in a gap, that final gap is removed
        from ``seqB`` and a ``"+"`` marker is inserted in ``seqA`` directly
        before its last token. In that case the returned ``seqA`` is two tokens
        longer than the returned ``seqB``.

        Parameters
        ----------
        seqA, seqB:
            Parallel aligned token lists. They are walked in lockstep with
            ``zip``, so any surplus tail of the longer list is ignored.

        Returns
        -------
        tuple[list[str], list[str]]
            Collapsed alignment pair. Empty input yields ``([], [])``.

        Notes
        -----
        Used in **CLDF conversion** for WestOldTurkic (``Monogap`` alignments) after
        global pairwise alignment.

        Examples
        --------
        >>> Cluster.gaps(["a", "b", "c", "d"], ["x", "-", "-", "y"])
        (['a', 'b.c', 'd'], ['x', '-', 'y'])
        >>> Cluster.gaps(["a", "b", "c"], ["x", "-", "-"])
        (['a', '+', 'b.c'], ['x'])
        >>> Cluster.gaps([], [])
        ([], [])
        """
        seqA_new: list[str] = []
        seqB_new: list[str] = []
        for tokA, tokB in zip(seqA, seqB):
            if seqB_new and tokB == "-" and seqB_new[-1] == "-":
                # Second (or later) gap in a row: fold its seqA partner into
                # the token that already sits opposite the kept gap.
                seqA_new[-1] += f".{tokA}"
            else:
                seqA_new.append(tokA)
                seqB_new.append(tokB)
        # Guard against empty alignments: indexing [-1] on an empty list
        # would raise IndexError.
        if seqB_new and seqB_new[-1] == "-":
            # Trailing gap: mark the unmatched seqA remainder with "+" placed
            # before it and drop the gap from seqB.
            seqA_new.insert(-1, "+")
            seqB_new.pop()
        return seqA_new, seqB_new