#!/usr/bin/env python3
"""Part 57: the Rugg table-and-grille hypothesis — the one named prior-art
generation mechanism this project never tested (Rugg 2004, Cryptologia:
Voynichese produced by sliding a 3-hole Cardan grille over a table of word
prefixes/mids/suffixes). Timm & Schinner's self-citation was tested to death
(Parts 5, 32); Rugg's mechanism was not. Here it gets the same treatment.

Construction (faithful, minimally fitted — Rugg's own claim is that a
casually-built table suffices): table columns = the empirically most frequent
B prefixes/mids/suffixes (decompose() from the generator work), rows in one
fixed random order; G grilles = random offset triples (a,b,c); the p-th word
of a run is T[(p+a)%R][pre] + T[(p+b)%R][mid] + T[(p+c)%R][suf]; the grille
switches every `switch` lines. Small random search over R, G, empty-cell
rate and switch rate, scored ONLY on surface stats (type count, Zipf, glyph
h2) — then the SAME battery the self-citation generator faced:

  B1 morphosyntax split-half r (edy/ey, dy/y)   [real B: 0.55]
  B2 adjacent near-twin rate (edit distance<=2)  [real B: high, self-cit too]
  B3 vowel-class adjacency leakage z             [real B: +11.6 / +8.3]
  B4 long-range class-MI excess at d=4..64       [real B: +8..15mb, z>=3]

PRE-REGISTERED: a grille is a PERIODIC device — expect comb artifacts in
MI(d) near the grille cycle and a deterministic column grammar that may
OVERSHOOT the real r=0.55 (real B is intermediate, which would itself be
diagnostic). Whatever the outcome, Rugg moves from 'never tested' to a
measured row in the generator ledger.
"""
import math, random, re
from collections import Counter
from canonical import parse
from generators import decompose, collect, consistency
from kipchak_test import editdist_le2, token_class, leakage_z, best_partition, vowel_seqs
from longrange_test import voy_cats, excess, fmt_row
from sensitivity_test import glyph_entropies, zipf_slope

# Seed the module-global RNG once so any helper that draws from it is repeatable.
random.seed(13)

toks, lines_all = parse()

# Keep only records flagged as text, tagged lang 'B', with at least two words.
B_lines = []
for rec in lines_all:
    if not rec['is_text'] or rec['lang'] != 'B':
        continue
    if len(rec['words']) < 2:
        continue
    B_lines.append(rec['words'])

# Flat token list over all retained B lines, in reading order.
Bf = [word for words in B_lines for word in words]

# Column inventories from real B: count each of the three parts decompose()
# returns (used below as prefix / mid / suffix).
cp, cm, ce = Counter(), Counter(), Counter()
for word in Bf:
    pre, mid, suf = decompose(word)
    cp[pre] += 1
    cm[mid] += 1
    ce[suf] += 1

# The most frequent entries of each inventory become the table's cell pools.
PRE_POOL = [item for item, _count in cp.most_common(60)]
MID_POOL = [item for item, _count in cm.most_common(120)]
SUF_POOL = [item for item, _count in ce.most_common(40)]

def make_table(R, empty, rng):
    """Build the grille table as three columns of R cells each.

    Columns are filled in the order prefix, mid, suffix. Every cell is the
    empty string with probability `empty`; otherwise it is one entry of the
    column's pool, drawn with weight equal to that entry's count in the
    matching real-B counter.

    Args:
        R: number of rows (cells per column).
        empty: probability that a cell is left empty.
        rng: a random.Random-like object; supplies all randomness.

    Returns:
        A 3-tuple of lists (prefix column, mid column, suffix column).
    """
    columns = []
    for pool, counts in ((PRE_POOL, cp), (MID_POOL, cm), (SUF_POOL, ce)):
        weights = [counts[item] for item in pool]
        cells = []
        for _ in range(R):
            # One uniform draw decides emptiness; a second draw is made only
            # for non-empty cells, which keeps the RNG stream layout fixed.
            if rng.random() < empty:
                cells.append('')
            else:
                cells.append(rng.choices(pool, weights)[0])
        columns.append(cells)
    return tuple(columns)

def gen_rugg(n_tokens, R, G, empty, switch, seed):
    """Generate table-and-grille text as a list of lines of words.

    A table of three R-cell columns is built with make_table(), and G grilles
    are drawn as random offset triples (a, b, c). Each word is
    Tp[(p+a)%R] + Tm[(p+b)%R] + Te[(p+c)%R], with the row pointer p advancing
    by one per word slot. Every `switch` line attempts a new grille and a new
    starting row are drawn. Each line has 6-12 word slots; slots that come out
    empty are skipped, and lines with no words are dropped (they still count
    towards the switch period).

    Args:
        n_tokens: stop once at least this many words have been produced; the
            result may exceed it by part of the final line.
        R: number of table rows (>= 1).
        G: number of grilles (>= 1).
        empty: probability that a table cell is empty.
        switch: number of line attempts between grille changes (>= 1).
        seed: seed for the private random.Random instance.

    Returns:
        A list of lines, each a non-empty list of non-empty word strings.

    Raises:
        ValueError: if R, G or switch is below 1, or if n_tokens > 0 but no
            grille/row combination can yield a non-empty word (the generation
            loop would otherwise never terminate).
    """
    if R < 1 or G < 1 or switch < 1:
        raise ValueError(
            f'R, G and switch must all be >= 1 '
            f'(got R={R}, G={G}, switch={switch})')

    rng = random.Random(seed)
    Tp, Tm, Te = make_table(R, empty, rng)
    grilles = [(rng.randrange(R), rng.randrange(R), rng.randrange(R))
               for _ in range(G)]

    # Termination guard: if every reachable cell triple is empty, `made` can
    # never grow. This check draws nothing from rng, so valid runs are
    # unaffected.
    if n_tokens > 0 and not any(
            Tp[(row + a) % R] or Tm[(row + b) % R] or Te[(row + c) % R]
            for a, b, c in grilles for row in range(R)):
        raise ValueError(
            'table/grille combination cannot produce any non-empty word')

    out_lines, made = [], 0
    # These two draws are overwritten on the first iteration (li == 0), but
    # they advance the RNG stream; dropping them would change the output for
    # every seed, so they are kept deliberately.
    g = rng.choice(grilles)
    li = 0
    p = rng.randrange(R)
    while made < n_tokens:
        if li % switch == 0:
            g = rng.choice(grilles)
            p = rng.randrange(R)
        a, b, c = g
        line = []
        for _ in range(rng.randint(6, 12)):
            w = Tp[(p + a) % R] + Tm[(p + b) % R] + Te[(p + c) % R]
            p += 1
            if w:
                line.append(w)
        if line:
            out_lines.append(line)
            made += len(line)
        # Counted even when the line was dropped, so the switch period is
        # measured in line attempts.
        li += 1
    return out_lines

# ---------------- minimal fit on surface stats only ----------------
# Surface targets measured once on the real-B token list Bf; surface_score()
# compares generated text against these module-level values.
t_types = len(set(Bf))  # number of distinct word types in Bf
# glyph_entropies returns a pair; only the second value (t_h2) is read
# elsewhere in this file.
t_h1, t_h2 = glyph_entropies(Bf)
t_zipf = zipf_slope(Bf)

def surface_score(lines):
    """Score generated lines against the real-B surface targets.

    The penalty is the sum of three absolute deviations: the second glyph
    entropy value scaled by 0.2, the Zipf slope scaled by 0.3, and the type
    count relative to the real type count. Lower is a closer fit.

    Args:
        lines: list of lines, each a list of word strings.

    Returns:
        (penalty, (type_count, zipf_slope, h2)) for the flattened tokens.
    """
    tokens = [word for line in lines for word in line]
    _h1, h2 = glyph_entropies(tokens)  # first value is not part of the score
    slope = zipf_slope(tokens)
    n_types = len(set(tokens))
    penalty = (abs(h2 - t_h2) / 0.2
               + abs(slope - t_zipf) / 0.3
               + abs(n_types - t_types) / t_types)
    return penalty, (n_types, slope, h2)

# Random search over the grille hyper-parameters; the candidate with the
# lowest surface penalty is kept (the first one wins ties).
# NOTE(review): candidates are ~6000 tokens while t_types is counted over all
# of Bf; type count generally grows with sample size, so confirm this size
# mismatch in the type-count term is intended.
best = None
rng0 = random.Random(99)
for trial in range(60):
    # Call arguments are evaluated left to right, so rng0 is consumed in the
    # order R, G, empty, switch.
    drawn = dict(
        R=rng0.choice([30, 40, 50, 64]),
        G=rng0.choice([8, 16, 24]),
        empty=rng0.choice([0.10, 0.25, 0.40]),
        switch=rng0.choice([1, 2, 4]),
    )
    cand = gen_rugg(6000, seed=trial, **drawn)
    s, st = surface_score(cand)
    if best is None or s < best[0]:
        best = (s, dict(drawn, seed=trial), st)
print(f'Real B surface targets: types={t_types} zipf={t_zipf:.2f} h2={t_h2:.3f}')
print(f'Best grille config {best[1]}  -> fit penalty {best[0]:.3f}')

# Regenerate at full corpus length with the winning hyper-parameters but a
# seed offset by 1000, i.e. a table and grille set the search never scored.
cfg = best[1]
final_kwargs = {key: cfg[key] for key in ('R', 'G', 'empty', 'switch')}
gl = gen_rugg(len(Bf), seed=1000 + cfg['seed'], **final_kwargs)
gf = [word for words in gl for word in words]
st = surface_score(gl)[1]
print(f'Fitted grille output:   types={st[0]} zipf={st[1]:.2f} h2={st[2]:.3f}  '
      f'({len(gf)} tokens)\n')

print('B1 morphosyntax split-half r (real B: 0.55-0.56)')
for s1, s2 in (('edy', 'ey'), ('dy', 'y')):
    # Grille first, then real B: the call order is kept in case the helpers
    # draw from the module-global RNG.
    from_grille = collect(gl, s1, s2)
    r, ns = consistency(from_grille)
    from_real = collect(B_lines, s1, s2)
    rr, nr = consistency(from_real)
    print(f'  {s1}/{s2:<4} grille r={r:+.3f} ({ns} stems)   real r={rr:+.3f} ({nr})')

def neartwin_rate(lines):
    """Return the fraction of adjacent word pairs flagged by editdist_le2.

    Pairs are formed only within a line (never across line boundaries).
    editdist_le2(a, b) is summed as a number; presumably it is truthy when the
    two words are within edit distance 2 -- confirm against kipchak_test.

    Args:
        lines: iterable of lines, each a sequence of word strings.

    Returns:
        hits / pairs as a float, or 0.0 when there are no adjacent pairs at
        all (no lines, or every line has fewer than two words) instead of
        raising ZeroDivisionError.
    """
    pairs = hits = 0
    for words in lines:
        for left, right in zip(words, words[1:]):
            pairs += 1
            hits += editdist_le2(left, right)
    return hits / pairs if pairs else 0.0
# B2: the grille rate is computed before the real-B rate (f-string order).
print(f'\nB2 adjacent near-twin rate: grille={neartwin_rate(gl):.3f}  '
      f'real={neartwin_rate(B_lines):.3f}')

# B3: the class partition is fitted on real B (Bf) and then applied to the
# grille lines. A fresh set is built for every call, as before.
VOWELS = 'oeay'
seqs = vowel_seqs(Bf, set(VOWELS))
inv = sorted({ch for seq in seqs for ch in seq})
_, mask = best_partition(seqs, inv)
# Bit i of mask gives the class of the i-th symbol of the sorted inventory.
cls = {glyph: bool((mask >> bit) & 1) for bit, glyph in enumerate(inv)}
r = leakage_z(gl, set(VOWELS), cls)
rt = leakage_z(gl, set(VOWELS), cls, drop_twins=True)
print(f'\nB3 vowel-class adjacency: grille z={r["z"]:+.1f} '
      f'(no-twins z={rt["z"]:+.1f})   real: +11.6 / +8.3')

# B4: computed on the flat grille token list; the real-B row is a hard-coded
# reference line rather than a recomputation.
print('\nB4 long-range class-MI excess (millibits; * z>=3): ')
cats, K = voy_cats(gf)
ex, _ = excess(cats, K, seed=17)
print(fmt_row('  grille', ex))
print('  real B   +81.9*   +23.5*   +14.6*   +11.8*    +8.5*   +10.3*'
      '    +4.0*    +7.9*    +6.3*    +4.0*  (from Part 56)')