#!/usr/bin/env python3
"""Part 55a: transcription-choice sensitivity. Two parsing decisions have sat
undocumented under every headline number since Part 1:
  (1) alternate readings [a:b:...] -> FIRST option taken;
  (2) uncertain spaces (',') -> treated as definite word BREAKS.
Neither had a sensitivity analysis. Here every cell of the 2x2 grid
{alt: first|second} x {commas: break|join} recomputes the headline numbers:
token/type counts, glyph h1/h2 (fixed estimator across cells), Zipf slope,
morphosyntax split-half r (edy/ey and dy/y; Part 10 battery via
generators.py), syllabary profile (BPE-80: units covering 90%, units/word),
and the Part-37 short-pair merge rate.
PRE-REGISTERED READ: a headline claim is 'transcription-fragile' if a grid
cell moves it by more than the claim's own uncertainty (r by >0.10; h2 by
>0.1 bits; merge rate by >10 points). Fragile claims get flagged in PAPER.md.
"""
import math, re
from collections import Counter
from canonical import parse
from generators import collect, consistency

def glyph_entropies(words):
    """Return (h1, h2) for the glyph stream built from `words`.

    The words are joined with '.' so the separator counts as a glyph.
    h1 is the Shannon entropy (bits) of the single-glyph distribution;
    h2 is the entropy of the adjacent-glyph-pair distribution minus h1.
    This is a fixed local estimator: it is meant for comparing grid cells
    against each other, not for quoting an absolute level.
    """
    def shannon_bits(counts):
        # Plug-in entropy of an empirical count table; an empty table
        # contributes an empty sum, so the result is 0.
        total = sum(counts.values())
        return -sum(k / total * math.log2(k / total) for k in counts.values())

    stream = '.'.join(words)
    unigram_bits = shannon_bits(Counter(stream))
    bigram_bits = shannon_bits(Counter(zip(stream, stream[1:])))
    return unigram_bits, bigram_bits - unigram_bits

def zipf_slope(words, kmax=200):
    """Return the least-squares slope of log(frequency) on log(rank).

    Only the `kmax` most frequent types of `words` are used; rank 1 is the
    most frequent type. Ties keep the order `Counter.most_common` gives them.

    Returns `math.nan` when fewer than two types are ranked (empty input, a
    single distinct word, or kmax < 2): a line cannot be fitted through
    fewer than two points, and the unguarded formula would divide by zero.
    """
    ranked = Counter(words).most_common(kmax)
    if len(ranked) < 2:
        return math.nan
    xs = [math.log(rank) for rank in range(1, len(ranked) + 1)]
    ys = [math.log(freq) for _, freq in ranked]
    mx = sum(xs) / len(xs)
    my = sum(ys) / len(ys)
    covariance = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    # Ranks are distinct, so with two or more points this is strictly positive.
    variance = sum((x - mx) ** 2 for x in xs)
    return covariance / variance

def merge_rate(lines, types):
    """Part-37 statistic: share of short adjacent pairs that merge cleanly.

    `lines` is an iterable of word lists and `types` maps word -> frequency.
    A pair of neighbouring words is a candidate when its shorter member has
    at most 3 glyphs. A candidate counts as a hit when the two words joined
    together form a type with frequency >= 2. Returns hits / candidates, or
    0.0 when there are no candidates.
    """
    attested = {form for form, freq in types.items() if freq >= 2}
    joined_candidates = [
        left + right
        for line in lines
        for left, right in zip(line, line[1:])
        if min(len(left), len(right)) <= 3
    ]
    if not joined_candidates:
        return 0.0
    hits = sum(1 for joined in joined_candidates if joined in attested)
    return hits / len(joined_candidates)

def bpe_profile(tokens, merges=80, cap=15000):
    """Syllabary profile of the first `cap` tokens after `merges` BPE merges.

    Returns (n90, units_per_seq):
      n90           -- how many of the most frequent units are needed for
                       their counts to reach 90% of all unit occurrences;
      units_per_seq -- total unit occurrences divided by the number of
                       sequences returned by `bpe_train`.

    Returns (0, 0.0) when `bpe_train` yields no sequences, instead of
    dividing by zero.

    NOTE(review): presumably `bpe_train` returns one unit sequence per input
    token, which would make the second value units/word — confirm against
    kipchak_test.bpe_train.
    """
    # Project-local import kept inside the function, as in the original.
    from kipchak_test import bpe_train
    seqs = bpe_train(tokens[:cap], merges)
    n_seqs = len(seqs)
    if n_seqs == 0:
        return 0, 0.0
    inventory = Counter(unit for seq in seqs for unit in seq)
    total = sum(inventory.values())
    running = n90 = 0
    # Walk units from most to least frequent until 90% coverage is reached.
    for n90, (_, count) in enumerate(inventory.most_common(), 1):
        running += count
        if running >= 0.9 * total:
            break
    return n90, total / n_seqs

# Pre-registered fragility thresholds from the module docstring, as
# (column label, key in a cell's stats dict, scale applied to the delta, limit).
# The merge rate is stored as a fraction, so its delta is scaled to points.
FLAG_RULES = (
    ('r edy/ey', 'r1', 1.0, 0.10),
    ('r dy/y', 'r2', 1.0, 0.10),
    ('h2', 'h2', 1.0, 0.1),
    ('merge%', 'mr', 100.0, 10.0),
)


def fragility_flags(base, stats):
    """Return [(label, delta), ...] for every headline number in `stats`
    that moved from `base` by more than its threshold in FLAG_RULES.

    Both arguments are dicts with keys 'h2', 'r1', 'r2' and 'mr'. `delta` is
    stats minus base, in the units of the threshold (points for merge%).
    """
    flags = []
    for label, key, scale, limit in FLAG_RULES:
        delta = scale * (stats[key] - base[key])
        if abs(delta) > limit:
            flags.append((label, delta))
    return flags


def main():
    """Print the 2x2 sensitivity grid, then each cell's fragility verdict
    against the first/break baseline."""
    print('SENSITIVITY GRID — parsing-choice robustness of headline numbers')
    # The two r columns are 10 wide to match the row cells below, which are
    # a 6-wide value followed by a 4-wide '(nn)' suffix.
    hdr = (f'{"cell":<22} {"tokens":>7} {"types":>6} {"h1":>6} {"h2":>6} '
           f'{"zipf":>6} {"r edy/ey":>10} {"r dy/y":>10} {"n90":>5} '
           f'{"u/wd":>5} {"merge%":>7}')
    print(hdr)
    base = None
    others = []
    for alt in ('first', 'second'):
        for commas in ('break', 'join'):
            toks, lines = parse(alt=alt, commas=commas)
            # Word lists of language-B text lines with at least two words.
            B = [l['words'] for l in lines
                 if l['is_text'] and l['lang'] == 'B' and len(l['words']) >= 2]
            Bf = [w for l in B for w in l]
            # Token/type counts, entropies and Zipf use every valid text
            # token, not only the language-B subset.
            text = [t['word'] for t in toks if t['valid'] and t['is_text']]
            h1, h2 = glyph_entropies(text)
            zp = zipf_slope(text)
            # NOTE(review): the second value of each pair is printed in
            # parentheses as a count — confirm against generators.consistency.
            r1, n1s = consistency(collect(B, 'edy', 'ey'))
            r2, n2s = consistency(collect(B, 'dy', 'y'))
            n90, upw = bpe_profile(Bf)
            mr = merge_rate(B, Counter(Bf))
            cell = f'{alt}/{commas}'
            print(f'{cell:<22} {len(text):>7} {len(set(text)):>6} {h1:>6.3f} '
                  f'{h2:>6.3f} {zp:>6.2f} {r1:>6.3f}({n1s:>2}) '
                  f'{r2:>6.3f}({n2s:>2}) '
                  f'{n90:>5} {upw:>5.2f} {100*mr:>6.1f}%')
            stats = dict(h2=h2, r1=r1, r2=r2, mr=mr)
            if alt == 'first' and commas == 'break':
                base = stats
            else:
                others.append((cell, stats))
    print()
    print('Baseline cell = first/break (the published configuration).')
    print('Flag thresholds: |dr|>0.10, |dh2|>0.1 bits, |dmerge|>10 points.')
    # Apply the thresholds to each non-baseline cell.
    for cell, stats in others:
        flags = fragility_flags(base, stats)
        if flags:
            moved = ', '.join(f'{label} ({delta:+.2f})' for label, delta in flags)
            verdict = f'FRAGILE: {moved}'
        else:
            verdict = 'ok'
        print(f'{cell:<22} {verdict}')


if __name__ == '__main__':
    main()
