#!/usr/bin/env python3
"""Label analysis: do the ~1,100 words written on drawings behave like
catalog references (within-page index clusters), like names/captions
(echoed in the page's text), or like generated filler?
"""
import re, random
from collections import Counter, defaultdict

# Directory that contains the input data file(s).
DATA = "/Users/arcandledger/taxdome/ancient-texts/data"

# One or more lowercase ASCII letters; applied with fullmatch, so the whole
# word has to qualify.
_PLAIN_WORD = re.compile(r'[a-z]+')


def valid(w):
    """Return True when *w* is made up solely of one or more letters a-z."""
    return bool(_PLAIN_WORD.fullmatch(w))

def clean(text):
    """Split the text of one transliterated line into raw word tokens.

    Markup is removed in this order:

    1. ``<!...>`` inline comments are deleted.
    2. ``<->`` becomes ``?``; ``<%>``, ``<$>`` and ``<@name>`` are deleted.
    3. ``@123;`` style numeric codes become ``?``.
    4. ``[first:rest]`` groups collapse to the part before the first colon,
       innermost groups first, until no group is left (any nesting depth).
    5. ``,`` is treated like the ``.`` separator; ``!`` and ``%`` are deleted.

    The result is split on ``.`` and empty pieces are dropped.  Tokens can
    still contain ``?`` or other non-letter characters.

    Args:
        text: Text portion of a line.

    Returns:
        list[str]: The non-empty tokens, in order of appearance.
    """
    t = re.sub(r'<!.*?>', '', text)
    t = re.sub(r'<->', '?', t)
    t = re.sub(r'<%>|<\$>|<@\w+>', '', t)
    t = re.sub(r'@\d+;', '?', t)
    # The pattern only matches groups that contain no bracket, so one pass
    # peels one nesting level.  Repeat until a pass changes nothing; this
    # terminates because every replacement makes the string shorter.
    replaced = 1
    while replaced:
        t, replaced = re.subn(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', t)
    t = t.replace(',', '.')
    t = re.sub(r'[!%]', '', t)
    return [w for w in t.split('.') if w]

# Per-page accumulators filled from the transliteration file below:
#   pages_meta[page] -> {key: value} from the "$K=value" fields of the page header
#   labels[page]     -> valid words from lines whose type starts with "L"
#   text_words[page] -> valid words from lines whose type starts with "P"
pages_meta, labels, text_words = {}, defaultdict(list), defaultdict(list)
# Locus line: <page.locus, Xtype[digits]> text   (X is a single marker character)
locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
# Page header line: <page> followed by the page's metadata fields
hdr = re.compile(r'^<(f[0-9a-zA-Z]+)>')
cur = None
# The context manager closes the file as soon as parsing finishes (or fails)
# instead of leaving the handle open until garbage collection.
with open(f"{DATA}/ZL3b-n.txt", encoding='utf-8') as src:
    for raw in src:
        raw = raw.rstrip('\n')
        # Skip blank lines and '#' comment lines.
        if not raw or raw.startswith('#'):
            continue
        locus = locus_re.match(raw)
        if locus is None:
            # Not a locus line: either a page header (record its metadata)
            # or something this script does not use.
            m = hdr.match(raw)
            if m:
                cur = m.group(1)
                pages_meta[cur] = dict(re.findall(r'\$(\w)=(\w+)', raw))
            continue
        page, _, _, ltype, _, text = locus.groups()
        # Keep only tokens made entirely of lowercase letters.
        ws = [w for w in clean(text) if valid(w)]
        kind = ltype.upper()
        if kind.startswith('L'):
            labels[page].extend(ws)
        elif kind.startswith('P'):
            text_words[page].extend(ws)

def sec(p):
    """Return the 'I' metadata value recorded for page *p*, or '?' if absent."""
    return pages_meta.get(p, {}).get('I', '?')


# Display names for the one-letter section codes returned by sec().
NAMES = {
    'H': 'herbal',
    'A': 'astronomical',
    'Z': 'zodiac',
    'B': 'biological',
    'C': 'cosmological',
    'P': 'pharmaceutical',
    'S': 'stars/recipes',
    'T': 'text-only',
}

# Flatten the per-page label lists into (page, word) tokens, page by page.
lab_tokens = []
for page_id, page_labels in labels.items():
    lab_tokens.extend((page_id, word) for word in page_labels)
# Distinct label words, and distinct running-text words over all pages.
lab_types = {word for _, word in lab_tokens}
txt_vocab = {word for page_words in text_words.values() for word in page_words}
print(f"Label tokens: {len(lab_tokens)}, types: {len(lab_types)}, on {len(labels)} pages")
# Label tokens per section, most frequent section first.
bysec = Counter(sec(page_id) for page_id, _ in lab_tokens)
print("By section:", {NAMES.get(k,k): v for k, v in bysec.most_common()})

# ---------- 1. labels vs text vocabulary profile ----------
# Compare label tokens with running-text tokens on two simple measures:
# the distribution of the first character and the mean word length.
txt_tokens = [w for ws in text_words.values() for w in ws]
fc_lab = Counter(w[0] for _, w in lab_tokens)
fc_txt = Counter(w[0] for w in txt_tokens)
nl, nt = sum(fc_lab.values()), sum(fc_txt.values())
print("\nFirst-glyph profile (% of tokens), labels vs running text:")
# Columns: the eight first characters most frequent among labels.  The
# character itself is the secondary sort key because the candidates come from
# a set, whose iteration order for strings can differ between interpreter
# runs; without it, tied characters would be ordered (and cut off at eight)
# differently from run to run.
chars = sorted(set(fc_lab) | set(fc_txt), key=lambda c: (-fc_lab.get(c, 0), c))[:8]
print("  char : " + '  '.join(f"{c:>5}" for c in chars))
print("  label: " + '  '.join(f"{100*fc_lab.get(c,0)/nl:>5.1f}" for c in chars))
print("  text : " + '  '.join(f"{100*fc_txt.get(c,0)/nt:>5.1f}" for c in chars))
# Mean word length, in characters, for each population.
ml = sum(len(w) for _, w in lab_tokens)/len(lab_tokens)
mt = sum(len(w) for w in txt_tokens)/len(txt_tokens)
print(f"mean length: labels {ml:.2f} vs text {mt:.2f}")

# ---------- 2. do labels appear in the text? ----------
def _pct(part, whole):
    """Return 100*part/whole, or NaN when *whole* is zero (nothing to measure)."""
    return 100*part/whole if whole else float('nan')


# One word set per page, built once; membership tests below reuse these
# instead of rebuilding a set for every single label token.
page_vocab = {p: set(ws) for p, ws in text_words.items()}
_no_words = frozenset()

in_vocab = sum(1 for w in lab_types if w in txt_vocab)
# Own-page hits over ALL label tokens, including pages with little or no
# running text; not printed below.
same_page = sum(1 for p, w in lab_tokens if w in page_vocab.get(p, _no_words))
# Despite its name this is a list of (page, label) tokens, restricted to pages
# that have at least 30 running-text words.
pages_with_text = [(p, w) for p, w in lab_tokens if len(text_words.get(p, [])) >= 30]
same_page_t = sum(1 for p, w in pages_with_text if w in page_vocab[p])
# chance baseline: how often does a label token occur in the text of a
# randomly drawn OTHER page (20 draws per token; a draw that lands on the
# token's own page is skipped and not counted as a trial)?
rng = random.Random(8)
hits = trials = 0
pg_list = [p for p, ws in text_words.items() if len(ws) >= 30]
for p, w in pages_with_text:
    for _ in range(20):
        q = rng.choice(pg_list)
        if q == p:
            continue
        hits += w in page_vocab[q]
        trials += 1
print(f"\nLabel types occurring ANYWHERE in running text: {in_vocab}/{len(lab_types)} ({_pct(in_vocab, len(lab_types)):.0f}%)")
print(f"Label tokens appearing in their OWN page's text: {same_page_t}/{len(pages_with_text)} ({_pct(same_page_t, len(pages_with_text)):.1f}%)")
print(f"  chance (same label vs random other page's text): {_pct(hits, trials):.1f}%")

# ---------- 3. within-page label similarity (the catalog-number test) ----------
def lev(a, b):
    """Return the Levenshtein (edit) distance between *a* and *b*.

    Insertions, deletions and substitutions each cost 1.  Only the previous
    and the current row of the dynamic-programming table are kept.
    """
    if a == b:
        return 0
    above = list(range(len(b) + 1))
    for row, ch_a in enumerate(a, start=1):
        # Column 0: turning the first `row` characters of a into "" takes
        # `row` deletions.
        here = [row]
        for col, ch_b in enumerate(b, start=1):
            delete = above[col] + 1
            insert = here[col - 1] + 1
            substitute = above[col - 1] + (ch_a != ch_b)
            here.append(min(delete, insert, substitute))
        above = here
    return above[-1]
def nd(a, b):
    """Return the edit distance between *a* and *b* divided by the longer length.

    The value is 0.0 for identical strings and at most 1.0.  Two empty strings
    are identical, so they yield 0.0 instead of dividing by zero.
    """
    longest = max(len(a), len(b))
    if longest == 0:
        return 0.0
    return lev(a, b)/longest


def pref2(a, b):
    """Return True when the first two characters of *a* and *b* are equal.

    Strings shorter than two characters are compared by what they have, so
    "o" matches "o" but not "ok".
    """
    return a[:2] == b[:2]

def pair_stats(pairs):
    """Summarise a collection of (word, word) pairs.

    Returns:
        tuple: (mean of nd over the pairs, fraction of pairs for which pref2
        is true).  Both entries are NaN when *pairs* is empty.
    """
    if not pairs:
        return float('nan'), float('nan')
    count = len(pairs)
    mean_dist = sum(nd(x, y) for x, y in pairs) / count
    prefix_share = sum(pref2(x, y) for x, y in pairs) / count
    return mean_dist, prefix_share

# Only pages that carry at least three labels take part in this test.
lab_pages = {}
for page_id, page_labels in labels.items():
    if len(page_labels) >= 3:
        lab_pages[page_id] = page_labels

# Every unordered pair of label tokens that sit on the same page.
within = [(first, second)
          for page_labels in lab_pages.values()
          for pos, first in enumerate(page_labels)
          for second in page_labels[pos+1:]]

# Comparison pairs: 40000 times, draw two different pages and one label from
# each; file the pair by whether the two pages share a section code.
across_sec, across_all = [], []
pl = list(lab_pages.items())
for _ in range(40000):
    page_a, page_b = rng.sample(pl, 2)
    a = rng.choice(page_a[1])
    b = rng.choice(page_b[1])
    if sec(page_a[0]) != sec(page_b[0]):
        across_all.append((a, b))
    else:
        across_sec.append((a, b))

w_nd, w_p2 = pair_stats(within)
s_nd, s_p2 = pair_stats(across_sec)
x_nd, x_p2 = pair_stats(across_all)
# "edit dist" is the mean length-normalised edit distance from pair_stats.
print("\n=== WITHIN-PAGE LABEL SIMILARITY ===")
print(f"{'tier':<34}{'edit dist':>10}{'share 2-glyph prefix':>22}{'pairs':>8}")
print(f"{'same page':<34}{w_nd:>10.3f}{100*w_p2:>21.1f}%{len(within):>8}")
print(f"{'same section, different page':<34}{s_nd:>10.3f}{100*s_p2:>21.1f}%{len(across_sec):>8}")
print(f"{'different section':<34}{x_nd:>10.3f}{100*x_p2:>21.1f}%{len(across_all):>8}")

# benchmark: the same two measures on running-text words, same-page vs
# cross-page, restricted to pages with at least 30 running-text words
tw_pages = {}
for page_id, page_words in text_words.items():
    if len(page_words) >= 30:
        tw_pages[page_id] = page_words
tin, tout = [], []
tp = list(tw_pages.items())
for _ in range(40000):
    # Same-page pair: two random positions on one random page; a draw that
    # hits the same position twice is dropped.
    picked = rng.choice(tp)
    words = picked[1]
    i = rng.randrange(len(words))
    j = rng.randrange(len(words))
    if i != j:
        tin.append((words[i], words[j]))
    # Cross-page pair: one random word from each of two different pages.
    first_page, second_page = rng.sample(tp, 2)
    tout.append((rng.choice(first_page[1]), rng.choice(second_page[1])))
ti_nd, ti_p2 = pair_stats(tin)
to_nd, to_p2 = pair_stats(tout)
print(f"\nBenchmark, running text:  same page {ti_nd:.3f} / prefix {100*ti_p2:.1f}%   "
      f"cross page {to_nd:.3f} / prefix {100*to_p2:.1f}%")
# Ratios below 1 mean same-page pairs are more alike than the comparison pairs.
print(f"Self-citation effect in text (same/cross): {ti_nd/to_nd:.3f}")
print(f"Label clustering effect (same-page/same-section): {w_nd/s_nd:.3f}")

# per-section view of label prefix clustering: same-page label pairs pooled
# by section; sections with fewer than 30 pairs are not reported
print("\nPer-section same-page prefix sharing (labels):")
for s in ['Z', 'A', 'P', 'H', 'B', 'S']:
    pw = []
    for p, ws in lab_pages.items():
        if sec(p) != s:
            continue
        for i, a in enumerate(ws):
            pw.extend((a, b) for b in ws[i+1:])
    if len(pw) >= 30:
        nd_, p2_ = pair_stats(pw)
        print(f"  {NAMES.get(s,s):<15} edit={nd_:.3f}  prefix2={100*p2_:.1f}%  (pairs={len(pw)})")

# examples: the first two zodiac pages, showing at most 14 labels each
zp = list(filter(lambda page: sec(page) == 'Z', lab_pages))[:2]
for p in zp:
    print(f"\nExample {p} ({NAMES.get(sec(p))}, {len(lab_pages[p])} labels): {', '.join(lab_pages[p][:14])}")