#!/usr/bin/env python3
"""Decisive control: does topic structure survive WITHIN a single scribe's hand?
If one scribe's vocabulary shifts with illustration type, 'topic' is not reducible
to scribal dialect. Uses the $H (hand), $I (illustration) and $L (language) page
variables, all of B, and also checks the A/B-spanning case across the whole manuscript.
"""
import re, math, random
from collections import Counter, defaultdict

# Directory containing the transliteration file that load_pages() reads.
DATA = "/Users/arcandledger/taxdome/ancient-texts/data"


def valid(w):
    """Return True when *w* is a non-empty run of lowercase letters a-z only."""
    matched = re.fullmatch(r'[a-z]+', w)
    return matched is not None

def load_pages(path=None):
    """Read the transliteration file and collect per-page metadata and tokens.

    Header lines of the form ``<fNN>`` (page name only) register a page: the
    ``$X=value`` pairs found on the line become that page's metadata dict and
    the page name is appended to the reading order.  Locus lines of the form
    ``<fNN.locus,@Ptype>text`` contribute tokens to their page, but only when
    the locus type starts with ``P``/``p`` (presumably paragraph text --
    confirm against the transliteration format notes).

    Args:
        path: File to read.  Defaults to ``ZL3b-n.txt`` inside ``DATA``.

    Returns:
        A tuple ``(pages_meta, order, page_tokens)`` where ``pages_meta`` maps
        page name to its ``{variable: value}`` dict, ``order`` lists page
        names in file order, and ``page_tokens`` is a ``defaultdict(list)``
        mapping page name to the tokens accepted by ``valid``.
    """
    if path is None:
        path = f"{DATA}/ZL3b-n.txt"

    locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
    hdr = re.compile(r'^<(f[0-9a-zA-Z]+)>')

    pages_meta, order = {}, []
    page_tokens = defaultdict(list)

    # The context manager closes the file even if parsing raises midway.
    with open(path, encoding='utf-8') as fh:
        for raw in fh:
            raw = raw.rstrip('\n')
            if not raw or raw.startswith('#'):
                continue

            locus = locus_re.match(raw)
            if locus is None:
                # Not a locus line: it is either a page header or ignorable.
                header = hdr.match(raw)
                if header:
                    name = header.group(1)
                    pages_meta[name] = dict(re.findall(r'\$(\w)=(\w+)', raw))
                    order.append(name)
                continue

            page, _, _, ltype, _, text = locus.groups()
            if not ltype.upper().startswith('P'):
                continue

            # Drop <!...> spans, turn <-> into '?', and drop <%>, <$>, <@word>.
            t = re.sub(r'<!.*?>', '', text)
            t = re.sub(r'<->', '?', t)
            t = re.sub(r'<%>|<\$>|<@\w+>', '', t)
            # '@NNN;' codes become '?', so the token holding them fails valid().
            t = re.sub(r'@\d+;', '?', t)
            # Replace each bracketed '[x:y...]' by the part before the first
            # colon; repeated so that nested brackets are unwrapped as well.
            for _ in range(4):
                t = re.sub(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', t)
            # Commas and dots both separate tokens; '!' and '%' are discarded.
            t = t.replace(',', '.')
            t = re.sub(r'[!%]', '', t)
            page_tokens[page].extend(w for w in t.split('.') if w and valid(w))

    return pages_meta, order, page_tokens

pages_meta, order, page_tokens = load_pages()

# Per-hand totals over the whole manuscript, keyed by (language, illustration):
# `table` holds token counts, `pagecount` the number of pages that had tokens.
# Missing page variables are recorded as '?'.
table = defaultdict(lambda: defaultdict(int))
pagecount = defaultdict(lambda: defaultdict(int))
for p in order:
    n_tokens = len(page_tokens[p])
    if n_tokens == 0:
        continue
    v = pages_meta.get(p, {})
    h = v.get('H', '?')
    cell = (v.get('L', '?'), v.get('I', '?'))
    table[h][cell] += n_tokens
    pagecount[h][cell] += 1

print("Hand x (Language, Section) token counts:")
for h in sorted(table):
    # Largest cells first; cells under 300 tokens are left out of the listing.
    ranked = sorted(table[h].items(), key=lambda item: -item[1])
    shown = [f"{L}/{i}:{n}" for (L, i), n in ranked if n >= 300]
    cells = '  '.join(shown)
    print(f"  hand {h}: {cells}")

def jsd(c1, c2):
    """Return the base-2 Jensen-Shannon divergence of two count mappings.

    Each mapping (e.g. a ``Counter``) is normalised by its own total to give
    a probability distribution; the result is the average of the two
    Kullback-Leibler divergences from the midpoint distribution.

    Raises ZeroDivisionError when either mapping has a zero total while the
    combined key set is non-empty.  Two empty mappings give 0.0.
    """
    total1 = sum(c1.values())
    total2 = sum(c2.values())

    def half_kl_term(prob, mid):
        # One summand of 0.5 * KL(dist || midpoint), in bits.
        return 0.5 * prob * math.log2(prob / mid)

    divergence = 0.0
    for key in set(c1) | set(c2):
        p = c1.get(key, 0) / total1
        q = c2.get(key, 0) / total2
        mid = (p + q) / 2
        # Zero-probability terms contribute nothing and would break log2.
        if p:
            divergence += half_kl_term(p, mid)
        if q:
            divergence += half_kl_term(q, mid)
    return divergence

def sample_tokens(pgs, N, rng):
    """Return a Counter over the first N tokens of a random page ordering.

    A copy of ``pgs`` is shuffled with ``rng`` (the caller's list is left
    untouched), whole pages are concatenated in that order until at least
    ``N`` tokens are gathered, and the pooled tokens are cut to ``N``.  If
    the pages hold fewer than ``N`` tokens in total, all of them are counted.
    """
    shuffled = pgs[:]
    rng.shuffle(shuffled)

    pooled = []
    for page in shuffled:
        pooled += page_tokens[page]
        if len(pooled) >= N:
            break

    return Counter(pooled[:N])

def compare(pgsA, pgsB, label, trials=25, seed=3):
    """Print between-group vs within-group vocabulary divergence for two page sets.

    Both figures are mean Jensen-Shannon divergences (``jsd``) between
    equal-sized samples of ``N`` tokens drawn with ``sample_tokens``:

    * between: a sample from ``pgsA`` against a sample from ``pgsB``.
    * within: one side is picked at random and its shuffled pages are split
      into two halves; a sample from each half is compared.  Trials where
      either half holds fewer than ``N`` tokens are skipped.

    Args:
        pgsA: List of page names forming the first group.
        pgsB: List of page names forming the second group.
        label: Text that prefixes the printed result line.
        trials: Number of between samples, and of within attempts.
        seed: Seed for the private ``random.Random`` used for all sampling.

    Returns:
        None.  One line is printed: either the result, or a "too few tokens"
        notice when the sample size would fall below 700.
    """
    def token_total(pages):
        return sum(len(page_tokens[p]) for p in pages)

    nA = token_total(pgsA)
    nB = token_total(pgsB)
    # Half of the smaller side, less 50 -- presumably slack so that a
    # half-split of one side can still supply N tokens (confirm intent).
    N = min(nA, nB)//2 - 50
    if N < 700:
        print(f"  {label}: too few tokens ({nA} vs {nB})")
        return

    rng = random.Random(seed)
    bet = [jsd(sample_tokens(pgsA, N, rng), sample_tokens(pgsB, N, rng))
           for _ in range(trials)]

    wit = []
    for _ in range(trials):
        side = pgsA if rng.random() < 0.5 else pgsB
        pp = side[:]
        rng.shuffle(pp)
        half = len(pp)//2
        first, second = pp[:half], pp[half:]
        if token_total(first) < N or token_total(second) < N:
            continue
        wit.append(jsd(sample_tokens(first, N, rng), sample_tokens(second, N, rng)))

    nan = float('nan')
    # Empty lists (trials=0, or every within trial skipped) report nan
    # instead of dividing by zero.
    b = sum(bet)/len(bet) if bet else nan
    w = sum(wit)/len(wit) if wit else nan
    # A within-group mean of exactly 0 would make b/w raise; report inf, or
    # nan when the between-group mean is not positive either.
    if w == 0:
        ratio = float('inf') if b > 0 else nan
    else:
        ratio = b/w
    print(f"  {label}: between={b:.4f}  within={w:.4f}  ratio={ratio:.3f}  (N={N}/side)")

print("\n=== SAME HAND, DIFFERENT SECTIONS (the decisive comparisons) ===")
done = 0
for h in sorted(table):
    # '?' collects pages with no $H value; that is not one scribe, so it is
    # skipped here just as the same-section comparison skips it.
    if h == '?':
        continue
    # Cells of this hand with enough text and a known illustration type.
    combos = [(L,i) for (L,i), n in table[h].items() if n >= 1500 and i != '?']
    # Group this hand's pages by cell using the same '?' defaults as `table`,
    # so the pages selected for a cell are the ones whose tokens were counted
    # toward its threshold.  Page order within each cell follows `order`.
    by_cell = defaultdict(list)
    for p in order:
        v = pages_meta.get(p, {})
        if v.get('H', '?') == h:
            by_cell[(v.get('L', '?'), v.get('I', '?'))].append(p)
    # Every unordered pair of qualifying cells, in listing order.
    for x, (L1, i1) in enumerate(combos):
        for (L2, i2) in combos[x+1:]:
            pgsA = by_cell[(L1, i1)]
            pgsB = by_cell[(L2, i2)]
            compare(pgsA, pgsB, f"hand {h}: {L1}/{i1} vs {L2}/{i2}")
            done += 1
if not done:
    print("  (no hand spans two sections with enough text)")

print("\n=== SAME SECTION, DIFFERENT HANDS (how big is pure scribe effect?) ===")
done = 0

# Pages that yielded tokens, grouped by their ($L, $I) values.
secs = defaultdict(list)
for p in order:
    if not page_tokens[p]:
        continue
    v = pages_meta.get(p, {})
    secs[(v.get('L'), v.get('I'))].append(p)

for (L, i), pgs in secs.items():
    # Split the section's pages by hand; pages without $H go under '?'.
    hands = defaultdict(list)
    for p in pgs:
        hands[pages_meta[p].get('H', '?')].append(p)

    # Known hands holding at least 1500 tokens within this section.
    hh = [
        h for h, pp in hands.items()
        if h != '?' and sum(len(page_tokens[q]) for q in pp) >= 1500
    ]

    # Compare every unordered pair of qualifying hands.
    for x, first in enumerate(hh):
        for second in hh[x+1:]:
            compare(hands[first], hands[second], f"section {L}/{i}: hand {first} vs hand {second}")
            done += 1

if not done:
    print("  (no section has two hands with enough text)")
