#!/usr/bin/env python3
"""Codebook/numeral test: are Voynich words ordered like sign-value numerals?

Two discriminators, applied identically to every corpus:
1. CONCORDANCE: fix one global ordering of the glyph alphabet (by mean within-word
   position), then measure what fraction of within-word glyph pairs respect it.
   Sign-value numerals (Roman) ~ high; positional digits (decimal) ~ 0.5; languages
   in between.
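2. SEPARATED REPEATS: fraction of words in which some glyph occurs twice with a
   DIFFERENT glyph in between (e.g. 'arbor': r..r). Impossible in purely additive
   sign-value notation (repeats must be adjacent runs: iii, ccc); subtractive Roman
   forms such as 'xix' or 'mcm' are the exception, so the Roman baseline generated
   below need not be exactly zero. Common in language.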
"""
import re, math, random
from collections import Counter, defaultdict

DATA = "/Users/arcandledger/taxdome/ancient-texts/data"


def valid(w):
    """Return True if *w* is a non-empty run of ASCII lowercase letters (a-z)."""
    return re.fullmatch(r'[a-z]+', w) is not None


# ---------------- corpora ----------------
def load_voynich(lang=None, path=None):
    """Load word tokens from a Voynich transliteration file.

    The parser recognises two kinds of lines (everything else is skipped):

    * page headers ``<fNN>`` followed by ``$X=value`` variables, which are
      remembered per page;
    * locus lines ``<fNN.locus,Ctype>text``. Only loci whose type starts with
      ``P`` (case-insensitive) are used -- presumably running paragraph text;
      confirm against the transliteration format documentation.

    Args:
        lang: if given, keep only loci on pages whose header variable ``L``
            equals this value (the script passes 'A' / 'B').
        path: transliteration file to read. Defaults to ``ZL3b-n.txt`` inside
            ``DATA``, which is what the original hard-coded.

    Returns:
        A list of words (str), in file order, containing only tokens made
        entirely of a-z. Tokens with any other character left after markup
        cleanup are discarded.

    Raises:
        OSError: if the file cannot be opened.
    """
    if path is None:
        path = f"{DATA}/ZL3b-n.txt"

    locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
    hdr = re.compile(r'^<(f[0-9a-zA-Z]+)>')
    alt_re = re.compile(r'\[([^:\[\]]*):[^\[\]]*\]')

    pages, toks = {}, []
    # Context manager so the handle is closed even if parsing raises.
    with open(path, encoding='utf-8') as fh:
        for raw in fh:
            raw = raw.rstrip('\n')
            if not raw or raw.startswith('#'):
                continue

            loc = locus_re.match(raw)
            if loc is None:
                # Not a locus line: it is either a page header (record its
                # $X=value variables) or something we do not care about.
                head = hdr.match(raw)
                if head:
                    pages[head.group(1)] = dict(re.findall(r'\$(\w)=(\w+)', raw))
                continue

            page, _, _, ltype, _, text = loc.groups()
            if not ltype.upper().startswith('P'):
                continue
            if lang and pages.get(page, {}).get('L') != lang:
                continue

            t = re.sub(r'<!.*?>', '', text)           # drop <!...> spans
            # NOTE(review): '<->' becomes '?', so 'a<->b' turns into the single
            # invalid token 'a?b' and BOTH neighbours are discarded. Confirm
            # this is intended rather than treating '<->' as a word break.
            t = re.sub(r'<->', '?', t)
            t = re.sub(r'<%>|<\$>|<@\w+>', '', t)      # drop remaining <..> markers
            t = re.sub(r'@\d+;', '?', t)               # numeric glyph codes poison the token
            # Resolve [first:alternatives] to its first reading. The pattern
            # only matches bracket groups without nested brackets, so repeat
            # to unwrap nesting from the inside out (up to 4 levels).
            for _ in range(4):
                t = alt_re.sub(r'\1', t)
            t = t.replace(',', '.')                    # ',' separates words just like '.'
            t = re.sub(r'[!%]', '', t)
            toks.extend(w for w in t.split('.') if w and valid(w))
    return toks

def roman(n):
    """Return *n* as a lowercase Roman numeral using subtractive pairs.

    Values are consumed greedily from the largest symbol down, so 4 -> 'iv',
    9 -> 'ix', 1994 -> 'mcmxciv'. There is no symbol above 'm', so large values
    simply repeat it (4000 -> 'mmmm'). Any n below 1 yields the empty string.
    """
    table = (
        (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),
        (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),
        (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i'),
    )
    pieces = []
    remaining = n
    for value, symbol in table:
        # Emit this symbol for as long as it still fits into what is left.
        while remaining >= value:
            pieces.append(symbol)
            remaining -= value
    return ''.join(pieces)

# Synthetic baselines drawn from ONE seeded generator, so runs are reproducible.
# The Roman corpus is drawn in full before the decimal one; swapping the two
# statements would change both corpora because they share the RNG stream.
rng = random.Random(11)
roman_corpus = list(map(roman, (rng.randint(1, 4999) for _ in range(30000))))
decimal_corpus = list(map(str, (rng.randint(1, 99999) for _ in range(30000))))

# Voynich glyph tokenizer: benched gallows and ch/sh count as single glyphs in
# EVA, so these multi-character sequences are emitted as one token each.
GLYPHS = ['cth', 'ckh', 'cph', 'cfh', 'ch', 'sh']


def eva_glyphs(w):
    """Split the EVA word *w* into a list of glyph tokens.

    Scanning left to right, the first entry of GLYPHS that matches at the
    current position is consumed as one token; if none matches, a single
    character is consumed instead. An empty word gives an empty list.
    """
    tokens = []
    cursor = 0
    end = len(w)
    while cursor < end:
        # First matching multi-character glyph, else the lone character here.
        token = next((g for g in GLYPHS if w.startswith(g, cursor)), w[cursor])
        tokens.append(token)
        cursor += len(token)
    return tokens

def plain(w):
    """Split *w* into a list of its individual characters (one glyph each)."""
    return list(w)


def _read_plain_words(filename, limit=35000):
    """Return the first *limit* a-z words of ``DATA/filename`` as glyph lists.

    The text is lower-cased and every maximal run of a-z is one word. The file
    is closed deterministically. It is decoded as UTF-8 with undecodable bytes
    replaced: only a-z runs are kept, so any non-ASCII or replaced character
    merely acts as a word boundary and cannot cause a decode crash or a
    locale-dependent result.
    """
    with open(f"{DATA}/{filename}", encoding='utf-8', errors='replace') as fh:
        text = fh.read()
    return [plain(w) for w in re.findall(r'[a-z]+', text.lower())[:limit]]


# name -> (list of words, placeholder). Each word is a list of glyph tokens.
# The second tuple slot is always None and is ignored by the reporting loop.
corpora = {
    'Voynich (EVA glyphs)': ([eva_glyphs(w) for w in load_voynich()], None),
    'Latin':   (_read_plain_words('latin.txt'), None),
    'English': (_read_plain_words('english.txt'), None),
    'Roman numerals': ([plain(w) for w in roman_corpus], None),
    'Decimal digits': ([plain(w) for w in decimal_corpus], None),
}

# ---------------- metrics ----------------
def analyze(words):
    """Compute glyph-ordering and repetition statistics for one corpus.

    Args:
        words: iterable of words, each word a sequence (list or str) of
            hashable glyph tokens. It is materialised once, so a generator
            is accepted.

    Returns:
        dict with keys
          conc  -- fraction of within-word pairs of distinct, retained glyphs
                   whose left-to-right order agrees with the global ordering
                   (pairs whose glyphs tie in the ordering are ignored);
                   NaN when there is no such pair.
          sep   -- fraction of words (length >= 2) in which some glyph occurs
                   in two or more separate runs; NaN when no word qualifies.
          adj   -- fraction of words (length >= 2) containing two equal
                   adjacent glyphs; NaN when no word qualifies.
          inv   -- number of glyphs with >= 0.2% of all glyph occurrences.
          order -- retained glyphs sorted by mean normalised position
                   (early -> late), estimated from words of length >= 2.
    """
    words = list(words)          # iterated several times below
    nan = float('nan')

    # glyph inventory with >=0.2% coverage
    gc = Counter(g for w in words for g in w)
    total = sum(gc.values())
    keep = {g for g, c in gc.items() if c / total >= 0.002}

    # Single-glyph (and empty) words carry no positional or pair information.
    scored = [w for w in words if len(w) >= 2]
    nw = len(scored)

    # mean within-word position, normalised to [0, 1]
    pos = defaultdict(list)
    for w in scored:
        last = len(w) - 1
        for i, g in enumerate(w):
            if g in keep:
                pos[g].append(i / last)
    order = {g: sum(v) / len(v) for g, v in pos.items()}

    agree = disagree = 0
    sep_repeat = adj_repeat = 0
    for w in scored:
        # Collapse each run of equal glyphs to one entry: a glyph appearing
        # twice in the collapsed form has a different glyph in between, and
        # any shrinkage means there was an adjacent repeat.
        runs = [g for i, g in enumerate(w) if i == 0 or w[i - 1] != g]
        if len(set(runs)) < len(runs):
            sep_repeat += 1
        if len(runs) < len(w):
            adj_repeat += 1

        # Concordance of within-word pairs under the global ordering. Equal
        # ranks (same glyph, or distinct glyphs that tie) count for neither.
        ranks = [order[g] for g in w if g in keep]
        for i, first in enumerate(ranks):
            for second in ranks[i + 1:]:
                if first < second:
                    agree += 1
                elif first > second:
                    disagree += 1

    conc = agree / (agree + disagree) if agree + disagree else nan
    # Same NaN convention as conc, instead of dividing by zero on a corpus
    # that has no word of length >= 2 (e.g. an empty language subset).
    sep = sep_repeat / nw if nw else nan
    adj = adj_repeat / nw if nw else nan
    return dict(conc=conc, sep=sep, adj=adj,
                inv=len(keep), order=sorted(order, key=order.get))

# Summary table: one row per corpus, with the per-corpus stats kept in `results`.
print(f"{'corpus':<22}{'concordance':>12}{'sep-repeat':>11}{'adj-repeat':>11}{'glyphs':>7}")
results = {}
for name, (words, _) in corpora.items():
    stats = results[name] = analyze(words)
    print(f"{name:<22}{stats['conc']:>12.3f}{stats['sep']:>11.3f}{stats['adj']:>11.3f}{stats['inv']:>7}")

# Inferred glyph orderings: Voynich, then the Roman baseline as a sanity check.
for heading, corpus_name in (
    ("\nInferred Voynich glyph order (early -> late within word):", 'Voynich (EVA glyphs)'),
    ("\nInferred Roman numeral order (sanity check, should be m<d<c<l<x<v<i-ish):", 'Roman numerals'),
):
    print(heading)
    print('  ' + ' < '.join(results[corpus_name]['order']))

# Currier A and B separately — does the ordering hold in both 'languages'?
for lang in ('A', 'B'):
    subset = analyze([eva_glyphs(w) for w in load_voynich(lang)])
    print(f"\nVoynich Currier {lang}: concordance={subset['conc']:.3f}  sep-repeat={subset['sep']:.3f}")
    print('  order: ' + ' < '.join(subset['order']))