#!/usr/bin/env python3
"""Part 54: the Kipchak hypothesis — period-correct Turkic tests the project
never ran, prompted by two user-supplied sources: the Codex Cumanicus (Kuun
1880, OCR; c. 1330 Kipchak) and Makhmutova's study linking Tatar dialects to
it. Part 13/47 only ever tested MODERN Anatolian Turkish (Oghuz, 600 y late).

Four pre-registered tests (pass/fail logic fixed before results):

T1  VOWEL HARMONY (the sharpest code-invariant Turkic signature, untested in
    this repo). For each corpus, take word types, extract vowel sequences,
    and find the vowel bipartition maximizing the 'pure word' rate (all
    vowels of a word in one class), subject to both classes carrying >=10%
    of vowel mass. Compare against the SAME optimization run on M surrogate
    corpora (the pooled vowel occurrences are shuffled and dealt back into
    the original word slots, so corpus vowel frequencies and per-word vowel
    counts are preserved). Report z. EXPECT: Turkish/Finnish/Hungarian/Cuman z >> 3
    with the linguistically correct classes discovered; Latin/English/Greek
    z ~< 2. Voynich vowels = Sukhotin set {o,e,a,y} (Part 38).
    READ: Voynich z >= 3 with balanced classes -> first positive
    harmonic-typology evidence (Turkic/Uralic tier). Voynich z < 2 while all
    harmonic controls fire -> glyph-level harmony absent; Kipchak survives
    only under encodings that bury vowel identity inside larger units (see
    T3) or word-level codes.

T2  SEGMENTATION-LEAKAGE HARMONY. If Voynich tokens are over-segmented
    fragments of words in a harmonic language (Part 37: 47% of adjacent
    short tokens merge into attested types), harmony must leak ACROSS
    adjacent tokens: P(adjacent tokens pure & same class) > token-shuffle
    null (tokens shuffled across lines, line lengths kept). Controls:
    Turkish as-is (no/weak effect — harmony is
    word-bounded), Turkish artificially over-segmented at syllable joints
    (strong effect), Latin over-segmented (no effect; no harmony to leak).

T3  LATENT UNIT-LEVEL HARMONY (robust to verbose encoding). On Voynich words
    as BPE-66 unit sequences (Part 36 inventory): find the unit bipartition
    maximizing within-word adjacent same-class rate (greedy flip search, 20
    restarts), z against the same search on globally unit-shuffled surrogates.
    Method validated on syllabified Turkish (must rediscover back/front
    syllable classes, z >> 3) and syllabified Latin (z ~< 2).

T4  PERIOD-KIPCHAK PROFILE. Cuman (lexicon + prose) vs Voynich on the
    code-invariant axes: syllable-inventory compactness (units covering 90%
    of mass; units/word vs Voynich 66 / 2.04), word-length distribution,
    TTR at matched N=1000, and final-bigram (suffix) concentration.

Cuman data via build_cuman.py. All Voynich parsing via canonical.py.
"""
import re, math, random, sys, unicodedata
from collections import Counter, defaultdict
from canonical import parse, DATA

# Seed the module-level RNG. The functions visible in this file draw from
# their own random.Random(seed) instances, so this only affects code that
# uses the global `random` generator directly.
random.seed(7)

# ---------------------------------------------------------------- corpora --
def strip_marks(s, keep="äöüı"):
    """Return `s` with diacritics removed, except on characters in `keep`.

    Characters listed in `keep` pass through untouched. Every other
    character is NFD-decomposed and its combining marks are dropped
    (e.g. 'ç' -> 'c'); a stand-alone combining mark therefore vanishes.
    """
    protected = set(keep)

    def _bare(ch):
        # Decompose, then keep only the non-combining code points.
        decomposed = unicodedata.normalize('NFD', ch)
        return ''.join(c for c in decomposed if not unicodedata.combining(c))

    return ''.join(ch if ch in protected else _bare(ch) for ch in s)

# Translation table for Hungarian text. The long front-rounded vowels ő/ű are
# mapped to ö/ü so they stay front vowels after de-accenting; the
# acute-accented vowels á/é/í/ó/ú are mapped to their plain letters.
HU_MAP = str.maketrans({'ő':'ö','ű':'ü','á':'a','é':'e','í':'i','ó':'o','ú':'u'})

def load_tokens(path, pat, n=60000, xform=None):
    """Read a text file and return its first `n` regex matches.

    path:  file to read; decoded as UTF-8 with undecodable bytes replaced.
    pat:   regular expression handed to re.findall on the lowercased text.
    n:     maximum number of matches returned.
    xform: optional str -> str function applied to the lowercased text
           before matching.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(path, encoding='utf-8', errors='replace') as fh:
        text = fh.read().lower()
    if xform:
        text = xform(text)
    return re.findall(pat, text)[:n]

def corpus_defs():
    """Build the table of corpora used by the tests.

    Returns a 3-tuple (C, voy_lines, voyA_lines):
      C          -- dict mapping corpus name to a dict with keys 'tokens'
                    (list of word strings), 'vowels' (set of vowel
                    characters) and, for the two Cuman corpora only, 'xnorm'
                    (a per-word normaliser the callers apply later).
      voy_lines  -- the 'B' line list returned by voynich().
      voyA_lines -- the 'A' line list returned by voynich().
    """
    def _read_words(path):
        # Close the handle deterministically instead of leaking it.
        # NOTE(review): still decoded with the platform default encoding, as
        # before; if build_cuman.py writes UTF-8, pass encoding='utf-8' here.
        with open(path) as fh:
            return fh.read().split()

    voy_toks, voy_lines, voyA_toks, voyA_lines = voynich()
    cu_lex = _read_words(f'{DATA}/cuman_lexicon.txt')
    cu_prose = _read_words(f'{DATA}/cuman_prose.txt')
    C = {
      'Voynich-B':  dict(tokens=voy_toks, vowels=set('oeay')),
      'Voynich-B+i':dict(tokens=voy_toks, vowels=set('oeayi')),
      'Voynich-A':  dict(tokens=voyA_toks, vowels=set('oeay')),
      'Cuman-lex':  dict(tokens=cu_lex,  vowels=set('aeiouäöüy'),
                         xnorm=lambda w: strip_marks(w)),
      'Cuman-prose':dict(tokens=cu_prose, vowels=set('aeiouäöüy'),
                         xnorm=lambda w: strip_marks(w)),
      # Turkish keeps ä/ö/ü/ı (strip_marks default) and de-accents the rest.
      'Turkish':    dict(tokens=load_tokens(f'{DATA}/typo_tr.txt',
                         r'[a-zçğıöşü]+', xform=lambda t: strip_marks(t)),
                         vowels=set('aeıioöuü')),
      'Finnish':    dict(tokens=load_tokens(f'{DATA}/fi_clean.txt',
                         r'[a-zäö]+'), vowels=set('aeiouyäö')),
      # HU_MAP is applied to the text before the regex is matched.
      'Hungarian':  dict(tokens=load_tokens(f'{DATA}/typo_hu.txt',
                         r'[a-záéíóöőúüű]+',
                         xform=lambda t: t.translate(HU_MAP)),
                         vowels=set('aeiouöü')),
      'Latin':      dict(tokens=load_tokens(f'{DATA}/latin.txt', r'[a-z]+'),
                         vowels=set('aeiouy')),
      'English':    dict(tokens=load_tokens(f'{DATA}/english.txt', r'[a-z]+'),
                         vowels=set('aeiouy')),
      # Greek and Czech are fully de-accented (keep='').
      'Greek':      dict(tokens=load_tokens(f'{DATA}/typo_el.txt',
                         r'[α-ω]+', xform=lambda t: strip_marks(t, keep='')),
                         vowels=set('αεηιουω')),
      'Czech':      dict(tokens=load_tokens(f'{DATA}/typo_cs.txt',
                         r'[a-zěščřžýáíéůú]+',
                         xform=lambda t: strip_marks(t, keep='')),
                         vowels=set('aeiouy')),
    }
    return C, voy_lines, voyA_lines

def voynich():
    """Return (B_words, B_lines, A_words, A_lines) from the canonical parse.

    Words are the 'word' fields of tokens flagged valid and is_text whose
    'lang' matches; lines are the 'words' lists of is_text lines of that
    'lang' holding at least two words.
    """
    toks, lines = parse()

    def _words(lang):
        return [t['word'] for t in toks
                if t['valid'] and t['is_text'] and t['lang'] == lang]

    def _lines(lang):
        return [ln['words'] for ln in lines
                if ln['is_text'] and ln['lang'] == lang
                and len(ln['words']) >= 2]

    b_words = _words('B')
    b_lines = _lines('B')
    a_words = _words('A')
    a_lines = _lines('A')
    return b_words, b_lines, a_words, a_lines

# ------------------------------------------------- T1: vowel harmony (types)
def vowel_seqs(tokens, vowels, xnorm=None, cap=6000):
    """Vowel tuples of the `cap` most frequent word types.

    Each type is optionally normalised with `xnorm`, reduced to the tuple of
    its characters found in `vowels`, and kept only if that tuple has at
    least two entries. Output follows Counter.most_common order.
    """
    normalise = xnorm if xnorm else (lambda word: word)
    ranked_types = (word for word, _ in Counter(tokens).most_common(cap))
    candidates = (tuple(ch for ch in normalise(word) if ch in vowels)
                  for word in ranked_types)
    return [vs for vs in candidates if len(vs) >= 2]

def best_partition(seqs, inventory, min_share=0.10, min_class=1):
    """Find the vowel bipartition with the highest pure-word rate.

    A sequence is 'pure' when all of its vowels fall in one class. Every
    candidate class must carry >= min_share of vowel occurrences and
    >= min_class vowel TYPES (singleton classes capture orthographic
    clustering of one sign, not harmony).

    seqs:      iterable of vowel sequences; every symbol must be in
               `inventory`.
    inventory: ordered list of vowel symbols; bit i of a mask stands for
               inventory[i].
    Returns (rate, mask), where the set bits of `mask` form one class and
    the remaining vowels the other. Returns the sentinel (-1.0, 0) when no
    admissible partition exists, including empty `seqs` or an inventory of
    fewer than two vowels.
    """
    n = len(inventory)
    bit = {v: 1 << i for i, v in enumerate(inventory)}
    vmass = Counter()
    # Signature compression: sequences with the same vowel SET behave
    # identically under every partition, so count each bitmask once.
    sig = Counter()
    for s in seqs:
        m = 0
        for ch in s:
            m |= bit[ch]
            vmass[ch] += 1
        sig[m] += 1
    total = sum(sig.values())
    best = (-1.0, 0)
    # Nothing to score (previously a ZeroDivisionError / negative shift).
    if not total or n < 2:
        return best
    massv = [vmass[v] for v in inventory]
    tmass = sum(massv)
    floor = min_share * tmass
    sig_items = list(sig.items())
    # Masks use bits 0..n-2 only, so the LAST inventory vowel is pinned to
    # the complement class; this enumerates each bipartition exactly once.
    for mask in range(1, 1 << (n - 1)):
        k1 = bin(mask).count('1')
        if k1 < min_class or n - k1 < min_class:
            continue
        m1 = sum(massv[i] for i in range(n) if mask >> i & 1)
        if m1 < floor or (tmass - m1) < floor:
            continue
        pure = sum(c for s, c in sig_items
                   if (s & mask) == s or (s & mask) == 0)
        rate = pure / total
        # Strict '>' keeps the first (lowest) mask among ties.
        if rate > best[0]:
            best = (rate, mask)
    return best

def harmony_z(seqs, vowels, M=25, seed=1, min_class=1):
    """z-score of the best pure-word rate against M shuffled surrogates.

    Each surrogate shuffles the pooled vowel occurrences and deals them back
    into sequences of the original lengths, then reruns best_partition.
    `vowels` is accepted but not used here. Returns None when the inventory
    has fewer than max(3, 2*min_class) vowels or there are fewer than 100
    sequences; otherwise a dict with keys rate, null, sd, z, classes, n.
    M must be at least 2 (the sample sd divides by M - 1).
    """
    inventory = sorted({ch for s in seqs for ch in s})
    too_small = (len(inventory) < 2 * min_class or len(inventory) < 3
                 or len(seqs) < 100)
    if too_small:
        return None
    observed, mask = best_partition(seqs, inventory, min_class=min_class)
    pool = [ch for s in seqs for ch in s]
    lengths = [len(s) for s in seqs]
    rng = random.Random(seed)
    null_rates = []
    for _ in range(M):
        # Fresh copy each round: every surrogate is a shuffle of the
        # original pool, not of the previous surrogate.
        deck = list(pool)
        rng.shuffle(deck)
        surrogate, pos = [], 0
        for size in lengths:
            surrogate.append(tuple(deck[pos:pos + size]))
            pos += size
        null_rates.append(
            best_partition(surrogate, inventory, min_class=min_class)[0])
    mean = sum(null_rates) / M
    # A zero spread falls back to a tiny sd so z stays finite.
    spread = (sum((x - mean) ** 2 for x in null_rates) / (M - 1)) ** 0.5 or 1e-9
    in_mask, out_mask = [], []
    for i, v in enumerate(inventory):
        (in_mask if (mask >> i) & 1 else out_mask).append(v)
    cls1 = ''.join(in_mask)
    cls0 = ''.join(out_mask)
    return dict(rate=observed, null=mean, sd=spread,
                z=(observed - mean) / spread,
                classes=f'{{{cls0}}} vs {{{cls1}}}', n=len(seqs))

# --------------------------------------- T2: cross-token harmony leakage --
def token_class(w, vowels, mask_classes):
    """Class label of a token, or None if it is vowel-less or mixed.

    Looks up every character of `w` that is in `vowels` in `mask_classes`;
    returns the shared label when all of them agree.
    """
    labels = {mask_classes[ch] for ch in w if ch in vowels}
    if len(labels) != 1:
        return None
    (only,) = labels
    return only

def editdist_le2(a, b):
    """True iff the Levenshtein distance between a and b is at most 2.

    Row-by-row DP over full rows; exits early when the length gap or an
    entire row already exceeds the limit.
    """
    limit = 2
    if abs(len(a) - len(b)) > limit:
        return False
    row = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        nxt = [i]
        for j, cb in enumerate(b, 1):
            nxt.append(min(row[j] + 1,              # deletion
                           nxt[j - 1] + 1,          # insertion
                           row[j - 1] + (ca != cb)))  # match / substitution
        # Row minima never decrease, so the final cell cannot recover.
        if min(nxt) > limit:
            return False
        row = nxt
    return row[-1] <= limit

def leakage_z(lines, vowels, classes, M=40, seed=2, drop_twins=False):
    """z-score of the adjacent-token same-class rate against a shuffle null.

    The statistic is the share of adjacent token pairs (within a line) whose
    tokens are both pure and carry the same class. The null reshuffles all
    tokens across lines while keeping every line's length.
    drop_twins: skip pairs within edit distance 2 (the self-citation
    near-twin channel), in the real data and in the null alike.
    Returns a dict with keys rate, null, z. M must be at least 2.
    """
    def stat(ls):
        hits = pairs = 0
        for ws in ls:
            labels = [token_class(w, vowels, classes) for w in ws]
            for i, (left, right) in enumerate(zip(ws, ws[1:])):
                if drop_twins and editdist_le2(left, right):
                    continue
                pairs += 1
                if labels[i] is not None and labels[i] == labels[i + 1]:
                    hits += 1
        return hits / pairs if pairs else 0.0

    observed = stat(lines)
    rng = random.Random(seed)
    pool = [w for ws in lines for w in ws]
    sizes = [len(ws) for ws in lines]
    null_rates = []
    for _ in range(M):
        # In-place shuffle: each round permutes the previous permutation.
        rng.shuffle(pool)
        surrogate, pos = [], 0
        for size in sizes:
            surrogate.append(pool[pos:pos + size])
            pos += size
        null_rates.append(stat(surrogate))
    mean = sum(null_rates) / M
    # A zero spread falls back to a tiny sd so z stays finite.
    spread = (sum((x - mean) ** 2 for x in null_rates) / (M - 1)) ** 0.5 or 1e-9
    return dict(rate=observed, null=mean, z=(observed - mean) / spread)

def oversegment(tokens, vowels, p=0.5, seed=3):
    """Randomly split words at syllable joints and regroup them into lines.

    A joint is a position where a vowel is followed by a consonant; the
    word's first and last positions are never cut. Each joint is cut with
    probability p. Fragments are accumulated into lines that are emitted as
    soon as they hold at least 8 fragments; a shorter trailing remainder is
    discarded.
    """
    rng = random.Random(seed)
    out, pending = [], []
    for w in tokens:
        cuts = [0]
        for i in range(1, len(w) - 1):
            at_joint = w[i - 1] in vowels and w[i] not in vowels
            # Draw from the RNG only at real joints.
            if at_joint and rng.random() < p:
                cuts.append(i)
        cuts.append(len(w))
        pending.extend(w[x:y] for x, y in zip(cuts, cuts[1:]) if y > x)
        if len(pending) >= 8:
            out.append(pending)
            pending = []
    return out

# ----------------------------- T3: latent unit harmony on BPE-66 units ----
def bpe_train(words, merges):
    """Minimal BPE (port of syllabary_test.py bpe_train).

    Starts from per-character sequences and, up to `merges` times, fuses the
    most frequent adjacent pair everywhere. Stops early when no pair is left
    or the top pair occurs fewer than twice. Returns the list of unit
    sequences, one per input word.
    """
    seqs = [list(w) for w in words]
    for _ in range(merges):
        pair_counts = Counter(pair for s in seqs for pair in zip(s, s[1:]))
        if not pair_counts:
            break
        (left, right), freq = pair_counts.most_common(1)[0]
        if freq < 2:
            break
        fused = left + right
        for s in seqs:
            # Left-to-right, non-overlapping replacement of (left, right).
            rebuilt, i, last = [], 0, len(s) - 1
            while i <= last:
                if i < last and s[i] == left and s[i + 1] == right:
                    rebuilt.append(fused)
                    i += 2
                else:
                    rebuilt.append(s[i])
                    i += 1
            s[:] = rebuilt
    return seqs

def best_bipartition_units(seqs, units, restarts=20, seed=4):
    """Best within-word adjacent same-class rate over unit bipartitions.

    Greedy single-unit flip search from `restarts` random labelings. One
    class must hold between max(1, 15%) and min(k-1, 85%) of the k unit
    types, because one-class labelings score 1.0 trivially. Pairs touching
    the 'OTH' bucket are ignored. Returns 0.0 when there are no countable
    pairs, and -1.0 when no restart ends inside the size bounds.
    """
    pair_counts = Counter(
        (a, b) for s in seqs for a, b in zip(s, s[1:])
        if a != 'OTH' and b != 'OTH')   # classless bucket carries no signal
    total = sum(pair_counts.values())
    if not total:
        return 0.0
    # Symmetric co-occurrence weights between DIFFERENT units; self-pairs
    # always agree and so never influence a flip decision.
    neigh = defaultdict(Counter)
    for (a, b), c in pair_counts.items():
        if a == b:
            continue
        neigh[a][b] += c
        neigh[b][a] += c
    units = sorted(units)
    k = len(units)
    lo = max(1, int(0.15 * k))
    hi = min(k - 1, int(0.85 * k))
    rng = random.Random(seed)
    best = -1.0
    for _ in range(restarts):
        lab = {u: rng.random() < 0.5 for u in units}
        ones = sum(lab.values())
        agree = sum(c for (a, b), c in pair_counts.items()
                    if lab[a] == lab[b])
        changed = True
        while changed:
            changed = False
            sweep = list(units)
            rng.shuffle(sweep)
            for u in sweep:
                ones_after = ones - 1 if lab[u] else ones + 1
                if ones_after < lo or ones_after > hi:
                    continue
                # Gain of flipping u: disagreeing neighbours start to agree,
                # agreeing neighbours stop.
                gain = 0
                for v, c in neigh[u].items():
                    gain += c if lab[v] != lab[u] else -c
                if gain > 0:
                    lab[u] = not lab[u]
                    agree += gain
                    ones = ones_after
                    changed = True
        # A random start outside the bounds may stay outside; discard it.
        if lo <= ones <= hi:
            score = agree / total
            if score > best:
                best = score
    return best

def unit_harmony_z(seqs, M=12, seed=5):
    """z-score of the best unit bipartition against unit-shuffled surrogates.

    Only sequences of two or more units are used. Each surrogate shuffles
    all unit tokens globally (including 'OTH') and deals them back into
    sequences of the original lengths; the surrogate search uses 8 restarts
    and seed 100 + k. Returns a dict with keys rate, null, z, k (number of
    unit types excluding 'OTH'). M must be at least 2.
    """
    multi = [s for s in seqs if len(s) >= 2]
    units = sorted({u for s in multi for u in s} - {'OTH'})
    observed = best_bipartition_units(multi, units)
    rng = random.Random(seed)
    pool = [u for s in multi for u in s]
    sizes = [len(s) for s in multi]
    null_scores = []
    for k in range(M):
        # In-place shuffle: each round permutes the previous permutation.
        rng.shuffle(pool)
        surrogate, pos = [], 0
        for size in sizes:
            surrogate.append(pool[pos:pos + size])
            pos += size
        null_scores.append(
            best_bipartition_units(surrogate, units, restarts=8, seed=100 + k))
    mean = sum(null_scores) / M
    # A zero spread falls back to a tiny sd so z stays finite.
    spread = (sum((x - mean) ** 2 for x in null_scores) / (M - 1)) ** 0.5 or 1e-9
    return dict(rate=observed, null=mean, z=(observed - mean) / spread,
                k=len(units))

def syllabify(w, vowels):
    """Naive max-onset-1 syllabifier: one syllable per vowel run.

    In every consonant cluster between two vowel runs, the last consonant
    opens the next syllable and the others close the previous one. Leading
    consonants join the first syllable, trailing ones the last. A word with
    no vowel is returned as a single unit. Adequate for unit statistics.
    """
    if not any(ch in vowels for ch in w):
        return [w]
    # Index of the first character of each maximal vowel run.
    run_starts = [i for i, ch in enumerate(w)
                  if ch in vowels and (i == 0 or w[i - 1] not in vowels)]
    # Runs are separated by at least one consonant, so each later run hands
    # exactly one consonant (the one right before it) to its own syllable.
    cuts = [0] + [start - 1 for start in run_starts[1:]] + [len(w)]
    return [w[x:y] for x, y in zip(cuts, cuts[1:]) if y > x]

def syl_seqs(tokens, vowels, cap=6000, top=66):
    """Syllable sequences of the `cap` most frequent word types.

    Types yielding a single syllable are dropped. The `top` most frequent
    syllables are kept as units; every other syllable becomes 'OTH'.
    """
    ranked_types = (w for w, _ in Counter(tokens).most_common(cap))
    multi = [syls for syls in (syllabify(w, vowels) for w in ranked_types)
             if len(syls) >= 2]
    freq = Counter(u for syls in multi for u in syls)
    frequent = {u for u, _ in freq.most_common(top)}
    return [[u if u in frequent else 'OTH' for u in syls] for syls in multi]

# ----------------------------------------------- T4: period-Kipchak profile
def ttr_boot(tokens, n=1000, B=50, seed=6):
    """Mean and sample sd of the type/token ratio over random windows.

    Draws B contiguous windows of n tokens at uniformly random start
    offsets and computes len(set(window)) / n for each.

    tokens: sequence of hashable tokens.
    n:      window length.
    B:      number of windows drawn.
    seed:   seed of the private random.Random instance.
    Returns None when fewer than n tokens are available, else (mean, sd).
    sd is 0.0 when B == 1, since a single draw has no sample spread.
    """
    if len(tokens) < n:
        return None
    rng = random.Random(seed)
    # Valid start offsets are 0..len-n inclusive. The previous exclusive
    # bound skipped the last window and raised ValueError when
    # len(tokens) == n.
    n_starts = len(tokens) - n + 1
    vals = []
    for _ in range(B):
        i = rng.randrange(n_starts)
        vals.append(len(set(tokens[i:i + n])) / n)
    mu = sum(vals) / B
    if B > 1:
        sd = (sum((x - mu) ** 2 for x in vals) / (B - 1)) ** 0.5
    else:
        sd = 0.0
    return mu, sd

def profile(tokens, vowels, name, xnorm=None, merges=80, cap=15000):
    """Token-weighted BPE-unit profile, same procedure as Part 36
    (syllabary_test.py). Run on Voynich-B this must reproduce ~66 / ~2.0
    (built-in validation).

    Uses the first `cap` tokens after optional `xnorm` normalisation;
    `vowels` is accepted but not used here. Returns a dict with:
      name -- the label passed in
      n90  -- number of most-frequent units covering 90% of unit-token mass
      upw  -- mean units per word
      wlen -- mean word length in characters
      fin5 -- share of words (length >= 2) ending in one of the five most
              common final bigrams
      ttr  -- ttr_boot() result for the same tokens
    """
    if xnorm:
        toks = [xnorm(w) for w in tokens][:cap]
    else:
        toks = list(tokens)[:cap]
    seqs = bpe_train(toks, merges)
    unit_freq = Counter(u for s in seqs for u in s)
    total_units = sum(unit_freq.values())
    covered = n90 = 0
    for n90, (_, count) in enumerate(unit_freq.most_common(), 1):
        covered += count
        if covered >= 0.9 * total_units:
            break
    units_per_word = total_units / len(seqs)
    mean_len = sum(len(w) for w in toks) / len(toks)
    finals = Counter(w[-2:] for w in toks if len(w) >= 2)
    top5 = sum(count for _, count in finals.most_common(5))
    return dict(name=name, n90=n90, upw=units_per_word, wlen=mean_len,
                fin5=top5 / sum(finals.values()), ttr=ttr_boot(toks))

# ============================================================== run it all
if __name__ == '__main__':
    # Driver: runs T1-T4 in order and prints one table per test to stdout.
    from generators import gen_selfcitation, gen_class_markov
    C, voy_lines, voyA_lines = corpus_defs()

    # ---- T1: run harmony_z on every corpus, once with min_class=1 ('free')
    # and once with min_class=2 ('balanced').
    print('=' * 72)
    print('T1  VOWEL HARMONY — optimized bipartition vs resampled-vowel null')
    print('(free = any partition; balanced = >=2 vowel types per class, the')
    print(' harmony-shaped solution space — singleton classes are orthography)')
    for label, mc in (('free', 1), ('balanced', 2)):
        print(f'-- {label} --')
        print(f'{"corpus":<12} {"n_seq":>6} {"rate":>7} {"null":>7} {"z":>7}   classes')
        for name, d in C.items():
            seqs = vowel_seqs(d['tokens'], d['vowels'], d.get('xnorm'))
            r = harmony_z(seqs, d['vowels'], min_class=mc)
            if r:
                print(f'{name:<12} {r["n"]:>6} {r["rate"]:>7.3f} {r["null"]:>7.3f} '
                      f'{r["z"]:>7.1f}   {r["classes"]}')
            else:
                # harmony_z returned None (inventory or sample too small)
                print(f'{name:<12} (insufficient data)')

    print()
    print('=' * 72)
    print('T2  SEGMENTATION-LEAKAGE — adjacent-token same-class rate vs null')
    # build mask classes from T1 best partition for each corpus tested
    # (default min_class=1, i.e. the 'free' partition; `name` is unused)
    def classes_of(name, d):
        seqs = vowel_seqs(d['tokens'], d['vowels'], d.get('xnorm'))
        inv = sorted(set(ch for s in seqs for ch in s))
        _, mask = best_partition(seqs, inv)
        return {v: bool(mask >> i & 1) for i, v in enumerate(inv)}

    voy_cls = classes_of('Voynich-B', C['Voynich-B'])
    VV = C['Voynich-B']['vowels']
    r = leakage_z(voy_lines, VV, voy_cls)
    print(f'Voynich-B lines           rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    r = leakage_z(voy_lines, VV, voy_cls, drop_twins=True)
    print(f'Voynich-B (no near-twins) rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    # VV is reused for Voynich-A: corpus_defs() gives A and B the same vowel set.
    vA_cls = classes_of('Voynich-A', C['Voynich-A'])
    r = leakage_z(voyA_lines, VV, vA_cls)
    print(f'Voynich-A lines           rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    # generator controls: does mechanical copying alone reproduce the effect?
    # Both generators are scored with the Voynich-B vowel set and classes.
    Bflat = [w for l in voy_lines for w in l]
    sc_lines = gen_selfcitation(len(Bflat), Bflat, seed=123, as_lines=True)
    r = leakage_z(sc_lines, VV, voy_cls)
    print(f'Self-citation generator   rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    r = leakage_z(sc_lines, VV, voy_cls, drop_twins=True)
    print(f'Self-cit. (no near-twins) rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    cm_lines = gen_class_markov(len(Bflat), voy_lines, seed=7, as_lines=True)
    r = leakage_z(cm_lines, VV, voy_cls)
    print(f'Class-Markov generator    rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    r = leakage_z(cm_lines, VV, voy_cls, drop_twins=True)
    print(f'Class-Mk. (no near-twins) rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    tr = C['Turkish']
    tr_cls = classes_of('Turkish', tr)
    # Chunk the first 30000 Turkish tokens into pseudo-lines of 8 tokens;
    # a trailing partial line is dropped.
    tr_lines_norm = []
    cur = []
    for w in tr['tokens'][:30000]:
        cur.append(w)
        if len(cur) >= 8:
            tr_lines_norm.append(cur); cur = []
    r = leakage_z(tr_lines_norm, tr['vowels'], tr_cls)
    print(f'Turkish intact            rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    r = leakage_z(oversegment(tr['tokens'][:30000], tr['vowels']), tr['vowels'], tr_cls)
    print(f'Turkish over-segmented    rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    la = C['Latin']
    la_cls = classes_of('Latin', la)
    r = leakage_z(oversegment(la['tokens'][:30000], la['vowels']), la['vowels'], la_cls)
    print(f'Latin over-segmented      rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')
    # oversegment() is called again with its default seed, so this scores the
    # same over-segmented Turkish lines as above, now without near-twins.
    r = leakage_z(oversegment(tr['tokens'][:30000], tr['vowels']), tr['vowels'], tr_cls,
                  drop_twins=True)
    print(f'Turk. overseg, no twins   rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f}')

    # ---- T3: Voynich-B units come from 80 BPE merges over the 6000 most
    # frequent types, keeping the 66 most frequent units (rest -> 'OTH');
    # Turkish and Latin use syllables from syl_seqs().
    print()
    print('=' * 72)
    print('T3  LATENT UNIT HARMONY — BPE/syllable units, bipartition vs null')
    voyB = C['Voynich-B']['tokens']
    vtypes = [w for w, _ in Counter(voyB).most_common(6000)]
    vseqs = bpe_train(vtypes, 80)
    keep = {u for u, _ in Counter(u for s in vseqs for u in s).most_common(66)}
    vseqs = [[u if u in keep else 'OTH' for u in s] for s in vseqs]
    r = unit_harmony_z(vseqs)
    print(f'Voynich-B BPE-66 units    rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f} (k={r["k"]})')
    r = unit_harmony_z(syl_seqs(tr['tokens'], tr['vowels']))
    print(f'Turkish syllables         rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f} (k={r["k"]})')
    r = unit_harmony_z(syl_seqs(la['tokens'], la['vowels']))
    print(f'Latin syllables           rate={r["rate"]:.3f} null={r["null"]:.3f} z={r["z"]:+.1f} (k={r["k"]})')

    # ---- T4: one profile() row per corpus; only the Cuman corpora carry an
    # 'xnorm' normaliser.
    print()
    print('=' * 72)
    print('T4  PERIOD-KIPCHAK PROFILE — token-weighted BPE-80 units '
          '(Part 36 method; Voynich-B row must self-validate near 66 / 2.0)')
    print(f'{"corpus":<12} {"n90":>5} {"units/wd":>9} {"wlen":>6} '
          f'{"top5-final":>10} {"TTR@1k":>12}')
    rows = [
        profile(voyB, C['Voynich-B']['vowels'], 'Voynich-B'),
        profile(C['Voynich-A']['tokens'], C['Voynich-A']['vowels'], 'Voynich-A'),
        profile(C['Cuman-lex']['tokens'], C['Cuman-lex']['vowels'],
                'Cuman-lex', C['Cuman-lex'].get('xnorm')),
        profile(C['Cuman-prose']['tokens'], C['Cuman-prose']['vowels'],
                'Cuman-prose', C['Cuman-prose'].get('xnorm')),
        profile(tr['tokens'], tr['vowels'], 'Turkish'),
        profile(C['Finnish']['tokens'], C['Finnish']['vowels'], 'Finnish'),
        profile(la['tokens'], la['vowels'], 'Latin'),
        profile(C['English']['tokens'], C['English']['vowels'], 'English'),
    ]
    for p in rows:
        t = p['ttr']
        # ttr is None when the corpus has fewer than 1000 tokens
        ttr = f'{t[0]:.3f}±{t[1]:.3f}' if t else '   (n<1000)'
        print(f'{p["name"]:<12} {p["n90"]:>5} {p["upw"]:>9.2f} {p["wlen"]:>6.2f} '
              f'{p["fin5"]:>10.3f} {ttr:>12}')