#!/usr/bin/env python3
"""Statistical analysis of the Voynich manuscript (ZL3b transliteration, IVTFF format)
against natural-language controls (Latin, English) and a random-gibberish control.
Pure stdlib. Outputs a readable report to stdout and JSON to results.json.
"""
import re, math, json, random, sys
from collections import Counter, defaultdict

DATA = "/Users/arcandledger/taxdome/ancient-texts/data"

# ---------------- IVTFF parsing ----------------
def parse_ivtff(path):
    pages = {}          # page -> dict of $vars
    lines = []          # records: dict(page, lang, illust, hand, locus_type, par_first, words, raw)
    cur_page, cur_vars = None, {}
    page_hdr = re.compile(r'^<(f[0-9a-zA-Z]+)>\s*(?:<!\s*(.*?)>)?')
    locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
    for raw in open(path, encoding='utf-8', errors='replace'):
        raw = raw.rstrip('\n')
        if not raw or raw.startswith('#'):
            continue
        m = page_hdr.match(raw)
        if m and '.' not in m.group(1) + '':
            # page header like <f1r>  <! $Q=A $L=A ...>
            if not locus_re.match(raw):
                cur_page = m.group(1)
                cur_vars = dict(re.findall(r'\$(\w)=(\w+)', raw))
                pages[cur_page] = cur_vars
                continue
        m = locus_re.match(raw)
        if not m:
            continue
        page, locnum, gen, ltype, lsub, text = m.groups()
        v = pages.get(page, {})
        par_first = '<%>' in text
        # clean text
        t = text
        t = re.sub(r'<!.*?>', '', t)          # inline comments
        t = re.sub(r'<->', '?', t)            # illegible chunk
        t = re.sub(r'<%>|<\$>|<@\w+>', '', t) # paragraph markers
        t = re.sub(r'@\d+;', '?', t)          # rare-glyph codes -> placeholder
        # alternate readings [a:b:...] -> first option
        for _ in range(4):
            t = re.sub(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', t)
        t = t.replace(',', '.')               # uncertain space -> space
        t = re.sub(r'[!%]', '', t)            # padding
        words = [w for w in t.split('.') if w]
        lines.append(dict(page=page, lang=v.get('L'), illust=v.get('I'),
                          hand=v.get('H'), ltype=ltype, par_first=par_first,
                          words=words))
    return pages, lines

# ---------------- helpers ----------------
def shannon(counter):
    n = sum(counter.values())
    return -sum(c/n * math.log2(c/n) for c in counter.values())

def cond_entropy_chars(text):
    """h1 and h2 (conditional bigram entropy) over the char stream incl. space."""
    uni, bi = Counter(text), Counter(zip(text, text[1:]))
    h1 = shannon(uni)
    n2 = sum(bi.values())
    # H(X2|X1) = H(X1,X2) - H(X1)
    h12 = -sum(c/n2 * math.log2(c/n2) for c in bi.values())
    return h1, h12 - shannon(Counter(text[:-1]))

def zipf_slope(freqs, maxrank=1000):
    fr = sorted(freqs, reverse=True)[:maxrank]
    pts = [(math.log(r+1), math.log(f)) for r, f in enumerate(fr)]
    n = len(pts); sx = sum(x for x,_ in pts); sy = sum(y for _,y in pts)
    sxx = sum(x*x for x,_ in pts); sxy = sum(x*y for x,y in pts)
    return (n*sxy - sx*sy) / (n*sxx - sx*sx)

def levenshtein(a, b, cap=None):
    if a == b: return 0
    la, lb = len(a), len(b)
    if cap is not None and abs(la-lb) > cap: return cap+1
    prev = list(range(lb+1))
    for i in range(1, la+1):
        cur = [i] + [0]*lb
        for j in range(1, lb+1):
            cur[j] = min(prev[j]+1, cur[j-1]+1, prev[j-1] + (a[i-1]!=b[j-1]))
        prev = cur
    return prev[lb]

def adjacent_similarity(tokens, seed=42):
    """Mean normalized edit distance of consecutive word pairs vs shuffled baseline."""
    pairs = [(a,b) for a,b in zip(tokens, tokens[1:])]
    def mean_nd(ps):
        s = n = 0
        for a,b in ps:
            s += levenshtein(a,b)/max(len(a),len(b)); n += 1
        return s/n
    real = mean_nd(pairs)
    sh = tokens[:]; random.Random(seed).shuffle(sh)
    base = mean_nd(list(zip(sh, sh[1:])))
    ident = sum(1 for a,b in pairs if a==b)/len(pairs)
    near = sum(1 for a,b in pairs if a!=b and levenshtein(a,b,2)<=1)/len(pairs)
    return real, base, ident, near

def neighbor_connectivity(vocab):
    """Fraction of word types with >=1 neighbor at edit distance 1 (deletion-neighborhood method)."""
    buckets = defaultdict(set)
    vs = set(vocab)
    for w in vs:
        buckets[w].add(w)
        for i in range(len(w)):
            buckets[w[:i]+w[i+1:]].add(w)
    connected = 0; total_nb = 0
    for w in vs:
        cand = set()
        keys = [w] + [w[:i]+w[i+1:] for i in range(len(w))]
        for k in keys:
            cand |= buckets.get(k, set())
        nb = sum(1 for c in cand if c != w and levenshtein(c, w, 1) == 1)
        if nb: connected += 1
        total_nb += nb
    return connected/len(vs), total_nb/len(vs)

def word_stats(tokens):
    lens = Counter(len(w) for w in tokens)
    n = len(tokens)
    mean = sum(l*c for l,c in lens.items())/n
    var = sum((l-mean)**2*c for l,c in lens.items())/n
    dist = {l: lens.get(l,0)/n for l in range(1, 13)}
    return mean, math.sqrt(var), dist

def tokenize_control(path, limit_chars=None):
    t = open(path, encoding='utf-8').read().lower()
    if limit_chars: t = t[:limit_chars]
    return re.findall(r'[a-z]+', t)

# ---------------- load corpora ----------------
print("Parsing IVTFF ...", file=sys.stderr)
pages, lines = parse_ivtff(f"{DATA}/ZL3b-n.txt")

def valid(w): return re.fullmatch(r'[a-z]+', w) is not None

# running text only (paragraph loci), labels separately
text_lines = [l for l in lines if l['ltype'].upper().startswith('P')]
label_lines = [l for l in lines if l['ltype'].upper().startswith('L')]

all_tokens   = [w for l in text_lines for w in l['words'] if valid(w)]
tok_A        = [w for l in text_lines if l['lang']=='A' for w in l['words'] if valid(w)]
tok_B        = [w for l in text_lines if l['lang']=='B' for w in l['words'] if valid(w)]
lab_tokens   = [w for l in label_lines for w in l['words'] if valid(w)]

lat = tokenize_control(f"{DATA}/latin.txt")
eng = tokenize_control(f"{DATA}/english.txt")

# gibberish control: random words, same alphabet size & word-length distribution as Voynich
rng = random.Random(7)
vw_lens = [len(w) for w in all_tokens]
alpha = sorted(set(''.join(all_tokens)))
gib = [''.join(rng.choice(alpha) for _ in range(L)) for L in vw_lens]

corpora = {'Voynich': all_tokens, 'Voynich-A': tok_A, 'Voynich-B': tok_B,
           'Latin': lat, 'English': eng, 'Random': gib}

R = {}
print("Computing global stats ...", file=sys.stderr)
for name, toks in corpora.items():
    c = Counter(toks)
    text = ' '.join(toks)
    h1, h2 = cond_entropy_chars(text)
    mean, sd, dist = word_stats(toks)
    R[name] = dict(tokens=len(toks), types=len(c), alphabet=len(set(text))-1,
                   ttr=len(c)/len(toks), zipf=zipf_slope(list(c.values())),
                   h1=h1, h2=h2, wlen_mean=mean, wlen_sd=sd, wlen_dist=dist,
                   top10=c.most_common(10),
                   top100_cov=sum(f for _,f in c.most_common(100))/len(toks))

print("Adjacent-word similarity ...", file=sys.stderr)
for name in ['Voynich', 'Latin', 'English', 'Random']:
    toks = corpora[name][:40000]
    real, base, ident, near = adjacent_similarity(toks)
    R[name].update(adj_real=real, adj_base=base, adj_ident=ident, adj_near=near)

print("Neighbor connectivity ...", file=sys.stderr)
nv = len(Counter(all_tokens))
for name in ['Voynich', 'Latin', 'English', 'Random']:
    vocab = [w for w,_ in Counter(corpora[name]).most_common(nv)]
    conn, mean_nb = neighbor_connectivity(vocab)
    R[name].update(net_conn=conn, net_mean_nb=mean_nb, net_vocab=len(vocab))

# ---------------- A vs B comparison ----------------
cA, cB = Counter(tok_A), Counter(tok_B)
topA = set(w for w,_ in cA.most_common(100)); topB = set(w for w,_ in cB.most_common(100))
jacc = len(topA & topB)/len(topA | topB)
sig = {}
for w in ['daiin','chol','chor','shol','qokeedy','qokedy','chedy','shedy','qokain','qokaiin','ol','aiin','dy']:
    sig[w] = (cA.get(w,0)/max(1,len(tok_A))*1000, cB.get(w,0)/max(1,len(tok_B))*1000)

# ---------------- line-position effects ----------------
def line_effects(lns):
    first_last = dict(first_len=[], mid_len=[], last_len=[],
                      last_m=0, last_n=0, other_m=0, other_n=0,
                      pf_par=0, par_words=0, pf_oth=0, oth_words=0,
                      gallows_first_par=0, par_lines=0, gallows_first_oth=0, oth_lines=0)
    for l in lns:
        ws = [w for w in l['words'] if valid(w)]
        if len(ws) < 3: continue
        first_last['first_len'].append(len(ws[0]))
        first_last['last_len'].append(len(ws[-1]))
        first_last['mid_len'] += [len(w) for w in ws[1:-1]]
        first_last['last_m'] += ws[-1].endswith('m'); first_last['last_n'] += 1
        for w in ws[:-1]:
            first_last['other_m'] += w.endswith('m'); first_last['other_n'] += 1
        # p/f concentration in paragraph-initial lines
        key = ('pf_par','par_words') if l['par_first'] else ('pf_oth','oth_words')
        first_last[key[0]] += sum(1 for w in ws if 'p' in w or 'f' in w)
        first_last[key[1]] += len(ws)
        gk = ('gallows_first_par','par_lines') if l['par_first'] else ('gallows_first_oth','oth_lines')
        first_last[gk[0]] += ws[0][0] in 'tkpf'
        first_last[gk[1]] += 1
    return first_last

fl = line_effects(text_lines)

# ---------------- output ----------------
def fmt(x): return f"{x:.3f}" if isinstance(x, float) else str(x)
print("\n=== GLOBAL STATISTICS " + "="*50)
cols = ['tokens','types','alphabet','ttr','zipf','h1','h2','wlen_mean','wlen_sd','top100_cov']
print(f"{'corpus':<11}" + ''.join(f"{c:>11}" for c in cols))
for name in corpora:
    print(f"{name:<11}" + ''.join(f"{fmt(R[name][c]):>11}" for c in cols))

print("\nWord-length distribution (% of tokens):")
print(f"{'len':<11}" + ''.join(f"{l:>7}" for l in range(1,13)))
for name in ['Voynich','Latin','English','Random']:
    d = R[name]['wlen_dist']
    print(f"{name:<11}" + ''.join(f"{100*d[l]:>7.1f}" for l in range(1,13)))

print("\n=== ADJACENT-WORD SIMILARITY (lower = more similar neighbours) " + "="*8)
print(f"{'corpus':<11}{'real':>9}{'shuffled':>10}{'ratio':>8}{'P(same)':>9}{'P(dist<=1)':>11}")
for name in ['Voynich','Latin','English','Random']:
    r = R[name]
    print(f"{name:<11}{r['adj_real']:>9.3f}{r['adj_base']:>10.3f}{r['adj_real']/r['adj_base']:>8.3f}"
          f"{r['adj_ident']:>9.4f}{r['adj_near']:>11.4f}")

print("\n=== VOCABULARY SIMILARITY NETWORK (types with an edit-distance-1 neighbour) ===")
for name in ['Voynich','Latin','English','Random']:
    r = R[name]
    print(f"{name:<11} vocab={r['net_vocab']:>6}  connected={100*r['net_conn']:>5.1f}%  mean#neighbours={r['net_mean_nb']:.2f}")

print("\n=== CURRIER A vs B " + "="*50)
print(f"A: {len(tok_A)} tokens, {len(cA)} types | B: {len(tok_B)} tokens, {len(cB)} types")
print(f"Jaccard overlap of top-100 word lists: {jacc:.3f}")
print(f"{'word':<10}{'A /1000':>10}{'B /1000':>10}")
for w,(a,b) in sig.items():
    print(f"{w:<10}{a:>10.2f}{b:>10.2f}")

print("\n=== LINE-POSITION EFFECTS (running text) " + "="*30)
import statistics as st
print(f"mean word length: line-first={st.mean(fl['first_len']):.2f}  mid={st.mean(fl['mid_len']):.2f}  line-last={st.mean(fl['last_len']):.2f}")
print(f"P(word ends in 'm'): line-final={fl['last_m']/fl['last_n']:.3f}  elsewhere={fl['other_m']/fl['other_n']:.3f}")
print(f"P(word contains p/f): paragraph-initial lines={fl['pf_par']/fl['par_words']:.3f}  other lines={fl['pf_oth']/fl['oth_words']:.3f}")
print(f"P(line starts with gallows t/k/p/f): par-initial={fl['gallows_first_par']/fl['par_lines']:.3f}  other={fl['gallows_first_oth']/fl['oth_lines']:.3f}")

print(f"\nLabels: {len(lab_tokens)} tokens, {len(set(lab_tokens))} types")
print("Top Voynich words:", R['Voynich']['top10'])
print("Top A:", cA.most_common(8))
print("Top B:", cB.most_common(8))

json.dump({k: {kk: vv for kk, vv in v.items() if kk != 'wlen_dist'} for k, v in R.items()},
          open('/Users/arcandledger/taxdome/ancient-texts/results.json', 'w'), indent=1, default=str)
print("\nresults.json written")
