#!/usr/bin/env python3
"""Part 58: space-agnostic re-segmentation — is the structure in the glyph
stream or in the scribal spaces?

Part 37 showed word boundaries are unreliable (47% of adjacent short pairs
merge into attested types); Parts 54/55 strengthened the over-segmentation
picture. Yet every word-level result tokenizes on EVA spaces. Here Currier-B
lines are stripped of spaces and re-segmented purely statistically (BPE to a
target merge count, applied per line; lines never merged — Part 16), and the
core battery re-runs on the re-segmented units.

Segmenter validation gate (pre-registered): the same segmenter on
space-stripped Latin and English must recover true boundaries with F1 >= 0.35
(literature range for unsupervised segmentation); below that, recovered-stat
comparisons are reported but conclusions capped.

KEY READOUT: the morphosyntax split-half r on re-segmented units (original
suffix pairs where present, plus the top re-derived final-string pairs).
Survives -> the grammar is a property of the glyph stream itself, robust to
boundary noise (strengthens Part 10 considerably). Dies -> the grammar is
locked to scribal spacing — itself diagnostic (a table/grille process writes
spaces mechanically; a copied phonetic script should not have
spacing-locked grammar).
"""
import math, random, re
from collections import Counter
from canonical import parse
from generators import collect, consistency
from sensitivity_test import zipf_slope

random.seed(17)

def bpe_segment_lines(char_lines, merges):
    """BPE trained on space-free lines; returns lines of unit-strings."""
    seqs = [list(s) for s in char_lines]
    for _ in range(merges):
        pc = Counter()
        for s in seqs:
            for a, b in zip(s, s[1:]):
                pc[(a, b)] += 1
        if not pc:
            break
        (a, b), c = pc.most_common(1)[0]
        if c < 4:
            break
        ab = a + b
        for s in seqs:
            i = 0
            while i < len(s) - 1:
                if s[i] == a and s[i + 1] == b:
                    s[i:i + 2] = [ab]
                else:
                    i += 1
    return seqs

def boundary_f1(seg_lines, true_lines):
    """Precision/recall of recovered internal boundaries vs true spaces."""
    tp = fp = fn = 0
    for seg, true_ws in zip(seg_lines, true_lines):
        true_b = set()
        pos = 0
        for w in true_ws[:-1]:
            pos += len(w)
            true_b.add(pos)
        got_b = set()
        pos = 0
        for u in seg[:-1]:
            pos += len(u)
            got_b.add(pos)
        tp += len(got_b & true_b)
        fp += len(got_b - true_b)
        fn += len(true_b - got_b)
    p = tp / (tp + fp) if tp + fp else 0
    r = tp / (tp + fn) if tp + fn else 0
    f1 = 2 * p * r / (p + r) if p + r else 0
    return p, r, f1

def battery(seg_lines, name):
    flat = [u for l in seg_lines for u in l]
    ty = len(set(flat))
    ml = sum(len(u) for u in flat) / len(flat)
    zp = zipf_slope(flat)
    # re-derive candidate suffix pairs: frequent final strings sharing stems
    freq = Counter(flat)
    fins = Counter()
    for w, c in freq.items():
        for k in (1, 2, 3):
            if len(w) > k + 1:
                fins[w[-k:]] += c
    cand = [s for s, _ in fins.most_common(12)]
    pairs = []
    for i in range(len(cand)):
        for j in range(i + 1, len(cand)):
            s1, s2 = cand[i], cand[j]
            if s1.endswith(s2) or s2.endswith(s1):
                continue
            pairs.append((s1, s2))
    scored = []
    for s1, s2 in pairs:
        D = collect(seg_lines, s1, s2)
        if len(D) >= 25:
            r, ns = consistency(D)
            if not math.isnan(r):
                scored.append((s1, s2, r, ns))
    scored.sort(key=lambda x: -x[3])
    print(f'\n{name}: tokens={len(flat)} types={ty} mean_len={ml:.2f} '
          f'zipf={zp:.2f}')
    for s1, s2 in (('edy', 'ey'), ('dy', 'y')):
        D = collect(seg_lines, s1, s2)
        r, ns = consistency(D)
        print(f'  original pair {s1}/{s2:<4}: r={r:+.3f} ({ns} stems)')
    for s1, s2, r, ns in scored[:4]:
        print(f'  re-derived  {s1}/{s2:<4}: r={r:+.3f} ({ns} stems)')

if __name__ == '__main__':
    toks, lines = parse()
    B = [l['words'] for l in lines
         if l['is_text'] and l['lang'] == 'B' and len(l['words']) >= 2]

    # ---- validation gate on Latin and English ----
    for name, path in (('Latin', 'data/latin.txt'), ('English', 'data/english.txt')):
        ws = re.findall(r'[a-z]+',
                        open(path, encoding='utf-8', errors='replace').read().lower())
        nat_lines = [ws[i:i + 8] for i in range(0, 40000, 8)]
        stripped = [''.join(l) for l in nat_lines]
        seg = bpe_segment_lines(stripped, 400)
        p, r, f1 = boundary_f1(seg, nat_lines)
        print(f'VALIDATION {name}: boundary P={p:.2f} R={r:.2f} F1={f1:.2f}')

    # ---- Voynich B, space-free ----
    stripped = [''.join(l) for l in B]
    for merges in (300, 600):
        seg = bpe_segment_lines(stripped, merges)
        p, r, f1 = boundary_f1(seg, B)
        print(f'\nVoynich B BPE-{merges}: recovered-vs-scribal boundaries '
              f'P={p:.2f} R={r:.2f} F1={f1:.2f}')
        battery(seg, f'B re-segmented (BPE-{merges})')

    # baseline: the same battery on the ORIGINAL segmentation
    battery(B, '\nB original spaces (baseline)')