#!/usr/bin/env python3
"""Part 63: abbreviated-Latin under verbose encoding — the constructive S2/S3 test.

Every entropy argument so far ran one direction: Voynich's glyph h2 (≈2.09) is
far below real language (≈3.2–3.3), and Part 34 showed the gap behaves like a
verbose-encoding artifact. The constructive test was never run: actually BUILD
the hypothesized object — medieval-style abbreviated Latin, verbose-encoded
into a 25-glyph Voynichese-like script — and measure whether it lands in
Voynich territory.

Pipeline:
  Stage 1 (Cappelli-style abbreviation of data/latin.txt):
    - 'et' -> '7' (Tironian), '-que' -> '%', per/pro/prae- -> '&'
    - terminal contraction: -us/-um/-ur -> '$'
    - nasal suppression: m/n before consonant deleted, preceding vowel -> '~'
      (at most one site per word: the first one that is accepted)
    - truncation: words still > 7 chars cut to 6 + terminal siglum '#'
    each applied to a fraction AGGR of eligible sites.
  Stage 2 (verbose encoding, 3 seeded table instantiations):
    - plaintext vowels -> single glyphs; consonants and sigla -> glyph
      DIGRAPHS over a 25-glyph alphabet (this is what 'verbose' means);
    - variant CTX: each consonant has TWO digraphs, selected by whether the
      preceding plaintext letter is a vowel (context-dependence raises
      neighbor-predictability, as Voynichese's rigid word grammar does).

Measured against Currier-B targets (same estimators as Parts 1/55):
  glyph h1 (3.87), h2 (2.09), gap h1-h2 (1.79), mean word length (5.09),
  Zipf slope (-0.78), BPE-80 profile n90 (64) and units/word (1.91), and the
  DIRECTION of the redundancy gap under BPE merging (Voynich: shrinks).

PRE-REGISTERED PASS: within ±15% on h1, h2 and gap, AND gap shrinks under
merging, AND BPE profile in syllabary territory (n90 50–120, units/word
1.5–3). Pass -> the S2/S3 family is demonstrated constructively (major
SCENARIOS update). Partial -> the unbuyable residue is the new sharpest
constraint on what verbose encoding alone cannot explain.
"""
import math, random, re
from collections import Counter
from canonical import parse
from sensitivity_test import glyph_entropies, zipf_slope
from kipchak_test import bpe_train

# Seed the module-level generator. Every function defined in this file draws
# from an explicit random.Random instance instead, so this seed only matters
# for code that uses the global `random` functions.
# NOTE(review): presumably that means the imported helpers — confirm whether
# any of them actually draw from the global generator.
random.seed(23)

# ----------------------------------------------------------- stage 1
VOW = set('aeiou')


def abbreviate(words, aggr, rng):
    """Rewrite each word with abbreviation sigla, every rule firing with probability `aggr`.

    Rules are tried in this order on each word:
      1. the word 'et' becomes '7' and no further rule is applied;
      2. a final 'que' on a word longer than 4 characters becomes '%';
      3. an initial 'prae' / 'pro' / 'per' becomes '&' when more than two
         characters follow the prefix;
      4. a final 'us' / 'um' / 'ur' on a word longer than 4 characters
         becomes '$';
      5. nasal suppression: scanning left to right, the first accepted site
         where 'm' or 'n' follows a vowel and precedes a non-vowel letter has
         the vowel+nasal pair replaced by '~' (at most one site per word);
      6. a word still longer than 7 characters is cut to its first 6
         characters plus '#'.

    `rng.random()` is consulted only when a site is otherwise eligible, so the
    number and order of draws depend on the input words.

    Args:
        words: iterable of word strings.
        aggr: threshold compared against `rng.random()`; a rule fires when
            the draw is strictly below it.
        rng: object exposing `random()` (e.g. a `random.Random`).

    Returns:
        A list holding one (possibly abbreviated) string per input word.
    """
    abbreviated = []
    for word in words:
        # Rule 1: Tironian 'et' replaces the whole word.
        if word == 'et' and rng.random() < aggr:
            abbreviated.append('7')
            continue

        # Rule 2: enclitic -que.
        if len(word) > 4 and word.endswith('que') and rng.random() < aggr:
            word = word[:-3] + '%'

        # Rule 3: the three prefixes are mutually exclusive, so at most one
        # of them can match and at most one draw is made.
        prefix = next((p for p in ('prae', 'pro', 'per') if word.startswith(p)), None)
        if prefix is not None and len(word) > len(prefix) + 2 and rng.random() < aggr:
            word = '&' + word[len(prefix):]

        # Rule 4: terminal contraction (the endings are mutually exclusive too).
        if word[-2:] in ('us', 'um', 'ur') and len(word) > 4 and rng.random() < aggr:
            word = word[:-2] + '$'

        # Rule 5: walk every interior letter together with its neighbours.
        # The triples come from the word as it stood before any replacement;
        # the loop stops right after the first replacement.
        neighbours = zip(word, word[1:], word[2:])
        for pos, (before, here, after) in enumerate(neighbours, start=1):
            eligible = (here in 'mn' and before in VOW
                        and after not in VOW and after.isalpha())
            if eligible and rng.random() < aggr:
                word = word[:pos - 1] + '~' + word[pos + 1:]
                break

        # Rule 6: truncation of whatever is still long.
        if len(word) > 7 and rng.random() < aggr:
            word = word[:6] + '#'

        abbreviated.append(word)
    return abbreviated

# ----------------------------------------------------------- stage 2
GLYPHS = list('abcdefghijklmnopqrstuvxyz')  # 25 glyphs: a-z without 'w'
PVOW = {'a': 'o', 'e': 'a', 'i': 'e', 'o': 'y', 'u': 'i'}  # plaintext vowel -> its single glyph


def make_table(seed, ctx):
    """Build the substitution table for consonants and sigla for one seed.

    The candidate digraphs are all two-glyph strings whose first glyph is not
    one of the glyphs PVOW assigns to vowels; they are shuffled with a
    `random.Random(seed)` and handed out in order.

    Args:
        seed: seed for the private random generator that shuffles the pool.
        ctx: when true, every symbol receives two different pool entries
            (consecutive ones); otherwise the same entry is stored twice.

    Returns:
        dict mapping each of the 19 consonant letters and 6 sigla
        ('7', '%', '&', '$', '~', '#') to a 2-tuple of digraph strings.
    """
    rng = random.Random(seed)
    symbols = list('bcdfghjklmnpqrstvxz') + list('7%&$~#')
    vowel_glyphs = set(PVOW.values())
    pool = [lead + trail
            for lead in GLYPHS if lead not in vowel_glyphs
            for trail in GLYPHS]
    rng.shuffle(pool)
    if ctx:
        # Entries 0/1 go to the first symbol, 2/3 to the second, and so on.
        pairs = zip(pool[0::2], pool[1::2])
    else:
        pairs = zip(pool, pool)
    # zip stops at the shorter input, i.e. once every symbol has its pair.
    return dict(zip(symbols, pairs))

def encode(words, table):
    """Encode plaintext words glyph by glyph using PVOW and `table`.

    Characters found in PVOW are replaced by their single glyph. Characters
    found in `table` are replaced by the first entry of their pair when the
    previous plaintext character was a vowel (or at the start of a word), and
    by the second entry otherwise. Any other character emits nothing but
    still counts as a non-vowel predecessor.

    Args:
        words: iterable of plaintext strings.
        table: mapping from character to an indexable pair of glyph strings.

    Returns:
        List of encoded strings; words that produce no glyphs are omitted.
    """
    encoded = []
    for word in words:
        glyphs = []
        after_vowel = True  # the start of a word is treated like "after a vowel"
        for letter in word:
            if letter in PVOW:
                glyphs.append(PVOW[letter])
                after_vowel = True
                continue
            if letter in table:
                variants = table[letter]
                glyphs.append(variants[0] if after_vowel else variants[1])
            # Both mapped consonants/sigla and unmapped characters land here.
            after_vowel = False
        if glyphs:
            encoded.append(''.join(glyphs))
    return encoded

# ----------------------------------------------------------- measures
def bpe_profile(tokens, merges=80, cap=15000):
    """Summarise the unit inventory produced by BPE on the first `cap` tokens.

    Args:
        tokens: sliceable sequence of word strings.
        merges: number of merges passed to `bpe_train`.
        cap: how many leading tokens are handed to `bpe_train`.

    Returns:
        (n90, units_per_seq): n90 is how many of the most frequent units are
        needed for their counts to reach 90% of all unit occurrences;
        units_per_seq is the total unit count divided by the number of
        sequences `bpe_train` returned.
        NOTE(review): presumably `bpe_train` returns one unit sequence per
        input token, making the second value "units per word" — confirm.
    """
    segmented = bpe_train(list(tokens[:cap]), merges)
    unit_counts = Counter(unit for seq in segmented for unit in seq)
    total = sum(unit_counts.values())
    threshold = 0.9 * total
    covered = 0
    n90 = 0
    # Walk units from most to least frequent until 90% coverage is reached;
    # n90 ends up as the rank of the last unit visited.
    for n90, (_, count) in enumerate(unit_counts.most_common(), start=1):
        covered += count
        if covered >= threshold:
            break
    return n90, total / len(segmented)

def unit_gap(tokens, merges, cap=12000):
    """Entropy gap h1-h2 over the unit stream after `merges` BPE merges.

    The sequences returned by `bpe_train` for the first `cap` tokens are
    concatenated into one stream, with a '.' appended after each sequence.

    Returns:
        (h1, h2, h1 - h2) in bits, where h1 is the unigram entropy of the
        stream and h2 is the entropy of adjacent pairs minus h1.
    """
    def entropy(counts, total):
        # Shannon entropy (bits) of the distribution counts / total.
        return -sum(c / total * math.log2(c / total) for c in counts.values())

    stream = []
    for units in bpe_train(list(tokens[:cap]), merges):
        stream += [*units, '.']

    unigram_h = entropy(Counter(stream), len(stream))
    pair_h = entropy(Counter(zip(stream, stream[1:])), len(stream) - 1)
    conditional_h = pair_h - unigram_h
    return unigram_h, conditional_h, unigram_h - conditional_h

def report(tokens, name, targets=None):
    """Measure one corpus, print a single summary line, and return the key stats.

    Glyph entropies, Zipf slope and mean word length use the first 30000
    tokens; the BPE profile and the merge-direction check apply the default
    caps of `bpe_profile` and `unit_gap`.

    Args:
        tokens: sliceable sequence of word strings.
        name: label printed at the start of the line.
        targets: accepted but not used by this function.

    Returns:
        dict with keys 'h1', 'h2', 'gap', 'dirn', 'n90' and 'upw'.
    """
    sample = tokens[:30000]
    h1, h2 = glyph_entropies(sample)
    zp = zipf_slope(sample)
    ml = sum(len(w) for w in sample) / len(sample)
    n90, upw = bpe_profile(tokens)

    # Compare the h1-h2 gap before merging with the gap after 80 merges;
    # changes of at most 0.05 either way are reported as 'flat'.
    g0 = unit_gap(tokens, 0)[2]
    g80 = unit_gap(tokens, 80)[2]
    if g80 < g0 - 0.05:
        direction = 'SHRINKS'
    elif g80 > g0 + 0.05:
        direction = 'grows'
    else:
        direction = 'flat'

    print(f'{name:<26} h1={h1:5.2f} h2={h2:5.2f} gap={h1-h2:5.2f} '
          f'wlen={ml:4.2f} zipf={zp:5.2f} n90={n90:4d} u/wd={upw:4.2f} '
          f'gap(0merge)={g0:4.2f}->gap(80)={g80:4.2f} {direction}')
    return {
        'h1': h1,
        'h2': h2,
        'gap': h1 - h2,
        'dirn': direction,
        'n90': n90,
        'upw': upw,
    }

if __name__ == '__main__':
    # Target corpus: words of valid text tokens tagged as language 'B'.
    toks, _ = parse()
    B = [t['word'] for t in toks if t['valid'] and t['is_text'] and t['lang'] == 'B']

    # Latin source: lowercase a-z runs, first 40000 words. The file is read
    # inside a `with` block so the handle is closed as soon as the text is in
    # memory instead of being left open for the rest of the run.
    with open('data/latin.txt', encoding='utf-8', errors='replace') as fh:
        lat = re.findall(r'[a-z]+', fh.read().lower())[:40000]

    print('targets first, then plain Latin, then the constructed objects:\n')
    tgt = report(B, 'Voynich B (target)')
    report(lat, 'Latin plain')

    verdicts = []
    for aggr in (0.5, 1.0):
        # One abbreviation pass per aggressiveness level, with a fixed seed so
        # every table variant below encodes the same abbreviated text.
        ab = abbreviate(lat, aggr, random.Random(31))
        for seed in (1, 2, 3):
            for ctx in (False, True):
                enc = encode(ab, make_table(seed, ctx))
                tag = f'abbrLat a={aggr} s={seed}{" ctx" if ctx else ""}'
                r = report(enc, tag)
                # Entropy criterion: h1, h2 and gap each within 15% of the
                # values measured on the target corpus above.
                ok_h = all(abs(r[k] - tgt[k]) / tgt[k] <= 0.15 for k in ('h1', 'h2', 'gap'))
                # Overall pass also needs the gap to shrink under merging and
                # the BPE profile to fall inside the stated n90 / units ranges.
                ok = ok_h and r['dirn'] == 'SHRINKS' and 50 <= r['n90'] <= 120 \
                     and 1.5 <= r['upw'] <= 3.0
                verdicts.append((tag, ok_h, r['dirn'], ok))

    print('\nPRE-REGISTERED VERDICT (h-stats within 15% / gap-shrink / syllabary profile):')
    for tag, ok_h, d, ok in verdicts:
        print(f'  {tag:<26} h-stats:{"PASS" if ok_h else "fail"}  '
              f'merge-direction:{d}  overall:{"PASS" if ok else "fail"}')