#!/usr/bin/env python3
"""Anatomy of the Voynich repetition anomaly: are adjacent repeats grammatical
(class-specific, structured) or artifactual (frequency-proportional)?
Null model: within-line shuffle (preserves line composition and burstiness)."""
import re, math, random
from collections import Counter, defaultdict

# Root directory under which the data files read by this script live.
BASE = "/Users/arcandledger/taxdome/ancient-texts"


def valid(w):
    """Return True when *w* is one or more ASCII lowercase letters and nothing else."""
    return bool(re.fullmatch(r'[a-z]+', w))

def load_lines(lang='B'):
    """Read word lines from ``{BASE}/data/ZL3b-n.txt``.

    The file is scanned line by line. Blank lines and lines starting with
    ``#`` are skipped. A page header line (``<fNN>`` followed by ``$X=value``
    variables) records that page's variables. A locus line
    (``<fNN.id,<pos-char><type><digits>> text``) is kept only when its type
    starts with ``P`` (case-insensitive) and, if *lang* is truthy, the page's
    ``L`` variable equals *lang*.

    Args:
        lang: Required value of the page's ``$L`` variable; a falsy value
            disables the filter.

    Returns:
        A list of dicts, one per retained line, with keys ``ws`` (list of
        tokens accepted by ``valid``), ``page`` (page id from the locus) and
        ``sec`` (the page's ``I`` variable, or ``'?'`` when absent). Lines
        with fewer than two accepted tokens are dropped.
    """
    locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
    hdr = re.compile(r'^<(f[0-9a-zA-Z]+)>')
    pages, lines = {}, []
    # Context manager so the handle is closed even if parsing raises.
    with open(f"{BASE}/data/ZL3b-n.txt", encoding='utf-8') as src:
        for raw in src:
            raw = raw.rstrip('\n')
            if not raw or raw.startswith('#'):
                continue
            m = locus_re.match(raw)
            if not m:
                # Not a locus: it may be a page header carrying $X=value variables.
                page_hdr = hdr.match(raw)
                if page_hdr:
                    pages[page_hdr.group(1)] = dict(re.findall(r'\$(\w)=(\w+)', raw))
                continue
            page, _, _, ltype, _, text = m.groups()
            if not ltype.upper().startswith('P'):
                continue
            v = pages.get(page, {})
            if lang and v.get('L') != lang:
                continue
            # Strip inline markup. Spans replaced by '?' make the containing
            # token fail the letters-only check below, so it is discarded.
            t = re.sub(r'<!.*?>', '', text)
            t = re.sub(r'<->', '?', t)
            t = re.sub(r'<%>|<\$>|<@\w+>', '', t)
            t = re.sub(r'@\d+;', '?', t)
            # Resolve [first:alternatives] to the first reading; repeated so
            # that nested brackets collapse from the inside out.
            for _ in range(4):
                t = re.sub(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', t)
            # Commas are treated as word separators, same as '.'.
            t = t.replace(',', '.')
            t = re.sub(r'[!%]', '', t)
            ws = [w for w in t.split('.') if w and valid(w)]
            if len(ws) >= 2:
                lines.append(dict(ws=ws, page=page, sec=v.get('I', '?')))
    return lines

# Lines from pages whose language variable is 'B', and the total token count.
lines = load_lines(lang='B')
N = sum(map(len, (entry['ws'] for entry in lines)))

def count_repeats(lns):
    """Count exact adjacent repeats within each line.

    Args:
        lns: Iterable of lines. Each line is either a sequence of words or a
            dict whose ``'ws'`` entry is that sequence.

    Returns:
        A tuple ``(pairs, runs, total_adj)``:

        * ``pairs`` -- Counter mapping a word to the number of positions where
          it is immediately followed by itself.
        * ``runs`` -- Counter mapping a run length (>= 2) to the number of
          maximal runs of one repeated word having that length.
        * ``total_adj`` -- total number of adjacent word pairs over all lines.
    """
    pairs, runs = Counter(), Counter()
    total_adj = 0
    for l in lns:
        ws = l['ws'] if isinstance(l, dict) else l
        # An empty line has no adjacent pairs; without the clamp it would
        # subtract one from the denominator.
        total_adj += max(len(ws) - 1, 0)
        run_len = 1
        for prev, nxt in zip(ws, ws[1:]):
            if prev == nxt:
                pairs[prev] += 1
                run_len += 1
                continue
            if run_len > 1:
                runs[run_len] += 1
            run_len = 1
        # Flush a run that reaches the end of the line.
        if run_len > 1:
            runs[run_len] += 1
    return pairs, runs, total_adj

# Repeats in the text as written.
obs_pairs, obs_runs, n_adj = count_repeats(lines)
n_obs = sum(obs_pairs.values())

# Null model: permute the words inside each line, so each line keeps exactly
# the same words and only their order is randomised.
rng = random.Random(13)  # fixed seed so the null is reproducible
REPS = 30
null_tot = []
null_by_word = Counter()
null_runs = Counter()
for _ in range(REPS):
    shuffled = []
    for entry in lines:
        permuted = list(entry['ws'])
        rng.shuffle(permuted)
        shuffled.append(permuted)
    rep_pairs, rep_runs, _ = count_repeats(shuffled)
    null_tot.append(sum(rep_pairs.values()))
    null_by_word.update(rep_pairs)
    null_runs.update(rep_runs)

null_mean = sum(null_tot) / REPS
# Population standard deviation over the replicates; the tiny fallback keeps
# the z-score below from dividing by zero when every replicate agrees.
null_variance = sum((x - null_mean) ** 2 for x in null_tot) / REPS
null_sd = null_variance ** 0.5 or 1e-9

print(f"Currier B: {N} tokens, {n_adj} adjacent pairs")
print(f"Observed exact adjacent repeats: {n_obs}  ({1000*n_obs/n_adj:.1f}/1000)")
print(f"Within-line-shuffle null: {null_mean:.1f} ± {null_sd:.1f}  -> z = {(n_obs-null_mean)/null_sd:.1f}")
print(f"Run lengths observed: {dict(sorted(obs_runs.items()))}")
mean_null_runs = {k: round(v/REPS, 1) for k, v in sorted(null_runs.items())}
print(f"Run lengths null (mean over {REPS}): " + str(mean_null_runs))

# Rank words by how far their observed repeat count exceeds the mean count
# under the shuffled null. Only the 60 most-repeated words with at least
# three observed repeats are considered.
print("\nTop repeating words (obs vs null-expected):")
freq = Counter(tok for entry in lines for tok in entry['ws'])
candidates = [
    (w, c, null_by_word[w] / REPS)
    for w, c in obs_pairs.most_common(60)
    if c >= 3
]
# Tuples lead with the excess (obs - null) so a descending sort ranks by it.
rows = sorted(((c - e, w, c, e, freq[w]) for w, c, e in candidates), reverse=True)
for _excess, w, c, e, f in rows[:14]:
    print(f"  {w:<10} obs={c:<3} null={e:.1f}  freq={f}")

def suffix_class(w):
    """Return the first listed suffix that ends *w* and leaves a non-empty stem.

    Candidates are tried in the listed order, so longer suffixes are matched
    before the shorter ones they contain. Returns ``'-'`` when none applies.
    """
    suffixes = (
        'eedy', 'edy', 'dy', 'eey', 'ey', 'aiin', 'ain', 'ol', 'or',
        'al', 'ar', 'am', 'y', 'o', 'l', 'r', 'n', 's',
    )
    matches = (s for s in suffixes if len(w) > len(s) and w.endswith(s))
    return next(matches, '-')

# Roll the per-word figures up to suffix classes: observed repeats, mean
# repeats under the null, and raw token frequency.
cls_obs = Counter()
cls_null = Counter()
cls_freq = Counter()
for word, n_rep in obs_pairs.items():
    cls_obs[suffix_class(word)] += n_rep
for word, n_rep in null_by_word.items():
    cls_null[suffix_class(word)] += n_rep / REPS
for word, n_tok in freq.items():
    cls_freq[suffix_class(word)] += n_tok

print("\nRepeats by suffix class (obs / null / share of corpus):")
for s, c in cls_obs.most_common(8):
    print(f"  -{s:<5} obs={c:<3} null={cls_null[s]:.1f}  corpus share={100*cls_freq[s]/N:.1f}%")

# Where in the line do repeat pairs fall, and how does the repeat rate vary
# by section?
pos_first = pos_mid = pos_last = 0
sec_rep, sec_adj = Counter(), Counter()
for entry in lines:
    words, section = entry['ws'], entry['sec']
    final_pair = len(words) - 2  # index of the last adjacent pair in the line
    sec_adj[section] += len(words) - 1
    for idx, (left, right) in enumerate(zip(words, words[1:])):
        if left != right:
            continue
        sec_rep[section] += 1
        # In a two-word line the only pair is both first and last; it is
        # counted as first.
        if idx == 0:
            pos_first += 1
        elif idx == final_pair:
            pos_last += 1
        else:
            pos_mid += 1
print(f"\nRepeat pair position in line: first={pos_first} mid={pos_mid} last={pos_last}")
# Sections with 500 or fewer adjacent pairs are left out of the rate table.
section_rates = {s: round(1000*sec_rep[s]/adj, 1) for s, adj in sec_adj.items() if adj > 500}
print("Repeat rate by section (/1000):", section_rates)

# near-repeats: direction of change
import functools
def lev(a, b):
    """Return the Levenshtein (insert/delete/substitute) distance between *a* and *b*."""
    if a == b:
        return 0
    # Two-row dynamic programme: `above` is the row for the previous prefix
    # of `a`, `row` is the one being filled in.
    above = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        row = [i]
        for j, ch_b in enumerate(b, start=1):
            delete = above[j] + 1
            insert = row[j - 1] + 1
            substitute = above[j - 1] + (ch_a != ch_b)
            row.append(min(delete, insert, substitute))
        above = row
    return above[-1]
# For adjacent words exactly one edit apart, tally whether the second word is
# shorter than, longer than, or the same length as the first.
longer = shorter = same_len = 0
ops = Counter()
for entry in lines:
    words = entry['ws']
    for a, b in zip(words, words[1:]):
        # The cheap length check runs before the edit-distance computation.
        if a == b or abs(len(a) - len(b)) > 1 or lev(a, b) != 1:
            continue
        growth = len(b) - len(a)
        if growth < 0:
            shorter += 1
        elif growth > 0:
            longer += 1
        else:
            same_len += 1
print(f"\nNear-repeat pairs (edit distance 1): second word shorter={shorter}, longer={longer}, same-length={same_len}")

# Finnish comparison: what repeats there?
# Take the first N lowercase alphabetic tokens so the sample size matches the
# main corpus. The context manager closes the file once it has been read.
with open(f"{BASE}/data/fi_clean.txt", encoding='utf-8') as fin_file:
    fin = re.findall(r'[^\W\d_]+', fin_file.read().lower())[:N]
# Chop the token stream into pseudo-lines of 6-12 words (seeded, so the
# segmentation is reproducible) so that repeats are counted within lines, as
# for the main corpus.
fl, i = [], 0
r2 = random.Random(1)
while i < len(fin):
    n = r2.randint(6, 12)
    fl.append(dict(ws=fin[i:i+n], sec='-'))
    i += n
fp, fr, fa = count_repeats(fl)
print(f"\nFinnish comparison: {sum(fp.values())} repeats ({1000*sum(fp.values())/fa:.1f}/1000); top: {fp.most_common(6)}")