#!/usr/bin/env python3
"""Indus script: ordering/notation battery on real CISI data (mayig digitization,
Parpola sign IDs, Mohenjo-daro M-1..M-199), with permutation controls, calibrated
against Voynich/Latin/English/Roman/decimal on the identical estimator.
"""
import re, json, glob, random
from collections import Counter, defaultdict

BASE = "/Users/arcandledger/taxdome/ancient-texts"

# ---------------- load Indus sequences ----------------
# Every corpus file is parsed as JSON and iterated entry by entry; from each
# entry's 'graphemes' list we keep the 'id' values that look like P<digits>
# with an optional lowercase letter suffix. Inscriptions with fewer than two
# such signs carry no ordering information and are dropped.
seqs = []
# sorted() pins the file order: glob's order depends on the filesystem, and
# the order of seqs feeds the seeded per-inscription shuffles used as the null.
for fn in sorted(glob.glob(f"{BASE}/indus-valley-script-corpus/corpus/*/*.json")):
    try:
        with open(fn, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # Best-effort load: unreadable, mis-encoded or malformed files are
        # skipped (JSONDecodeError and UnicodeDecodeError are ValueErrors).
        continue
    for entry in data:
        gs = [g.get('id') for g in entry.get('graphemes', []) if g.get('id')]
        gs = [g for g in gs if re.fullmatch(r'P\d+[a-z]?', g)]
        if len(gs) >= 2:
            seqs.append(gs)

n_tokens = sum(map(len, seqs))
# Guard the mean so a missing/empty corpus reports nan instead of crashing.
mean_len = n_tokens / len(seqs) if seqs else float('nan')
print(f"Indus: {len(seqs)} inscriptions (len>=2), {n_tokens} sign tokens, "
      f"{len(set(g for s in seqs for g in s))} unique signs, "
      f"mean length {mean_len:.2f}")

# ---------------- shared machinery ----------------
def concordance(words, min_cov=0.002):
    """Score how consistently signs keep a fixed relative order inside words.

    A sign is kept when it occurs at least 5 times and its share of all sign
    tokens is at least ``min_cov``. Each kept sign gets a rank: its mean
    normalised position (0 = first slot, 1 = last slot) over all words of
    length >= 2. Every within-word pair of kept signs is then counted as
    agreeing when the earlier sign has the lower rank and as disagreeing when
    it has the higher rank; equal ranks (including a sign paired with itself)
    are not counted.

    Returns a tuple ``(score, keep, order)`` where ``score`` is
    agree / (agree + disagree), or nan when no pair was counted, ``keep`` is
    the set of kept signs and ``order`` maps each positioned sign to its rank.
    """
    freq = Counter(sign for word in words for sign in word)
    n_tokens = sum(freq.values())
    keep = {
        sign
        for sign, count in freq.items()
        if count >= 5 and count / n_tokens >= min_cov
    }

    # Collect normalised positions; one-sign words have no defined position.
    positions = defaultdict(list)
    for word in words:
        last = len(word) - 1
        if last < 1:
            continue
        for idx, sign in enumerate(word):
            if sign in keep:
                positions[sign].append(idx / last)
    order = {sign: sum(vals) / len(vals) for sign, vals in positions.items()}

    agree = disagree = 0
    for word in words:
        if len(word) < 2:
            # Cannot contain a pair, and its signs may have no rank at all.
            continue
        ranks = [order[sign] for sign in word if sign in keep]
        for i, earlier in enumerate(ranks):
            for later in ranks[i + 1:]:
                if earlier < later:
                    agree += 1
                elif earlier > later:
                    disagree += 1

    counted = agree + disagree
    score = agree / counted if counted else float('nan')
    return score, keep, order

def shuffled_baseline(words, reps=10, seed=4):
    """Return the mean concordance score over within-word shuffled corpora.

    For each of ``reps`` repetitions every word is copied and shuffled in
    place with one ``random.Random(seed)`` generator shared across all words
    and repetitions, and the resulting corpus is scored with ``concordance``.
    The inputs are never mutated. The arithmetic mean of the ``reps`` scores
    is returned.
    """
    rng = random.Random(seed)

    def permuted(word):
        # Shuffle a copy so the caller's word keeps its original order.
        scrambled = list(word)
        rng.shuffle(scrambled)
        return scrambled

    scores = [
        concordance([permuted(word) for word in words])[0]
        for _ in range(reps)
    ]
    return sum(scores) / len(scores)

def repeats(words):
    """Measure how often a sign recurs inside a word.

    Only words of length >= 2 are considered. Two shares are returned as a
    tuple ``(sep, adj)``:

    * ``sep`` - fraction of words in which some sign starts more than one
      run, i.e. it comes back after at least one different sign;
    * ``adj`` - fraction of words containing at least one pair of identical
      neighbouring signs.

    When no word has length >= 2 both shares are nan (matching how
    ``concordance`` reports an undefined score) instead of raising
    ZeroDivisionError.
    """
    nw = sep = adj = 0
    for w in words:
        if len(w) < 2:
            continue
        nw += 1
        # Count run starts per sign: a position opens a run when it is the
        # first slot or differs from the sign just before it.
        run_starts = Counter(
            g for i, g in enumerate(w) if i == 0 or w[i - 1] != g
        )
        if any(n > 1 for n in run_starts.values()):
            sep += 1
        if any(left == right for left, right in zip(w, w[1:])):
            adj += 1
    if not nw:
        nan = float('nan')
        return nan, nan
    return sep / nw, adj / nw

# ---------------- calibration corpora ----------------
def load_voynich(lang=None):
    """Return lowercase word tokens from the transliteration file.

    Reads ``{BASE}/data/ZL3b-n.txt`` (presumably an IVTFF-style Voynich
    transliteration - confirm against the data source). Blank lines and
    lines starting with ``#`` are skipped. Page header lines (``<fNN>``)
    contribute their ``$X=value`` variables, stored per page. Locus lines are
    kept only when their type code starts with ``P`` (case-insensitive) and,
    if ``lang`` is given, when the page's ``L`` variable equals ``lang``.

    The text of each kept locus is cleaned, split on ``.`` (commas are
    treated as ``.``), and only tokens made purely of ``a-z`` are returned,
    so any token still containing ``?`` or other markup is discarded.

    Parameters:
        lang: optional value the page-level ``L`` variable must match;
            ``None`` keeps every page.

    Returns:
        A flat list of token strings in file order.
    """
    pages, cur, toks = {}, None, []
    locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
    hdr = re.compile(r'^<(f[0-9a-zA-Z]+)>')
    # The context manager closes the file even if parsing raises midway.
    with open(f"{BASE}/data/ZL3b-n.txt", encoding='utf-8') as fh:
        for raw in fh:
            raw = raw.rstrip('\n')
            if not raw or raw.startswith('#'):
                continue
            m = hdr.match(raw)
            if m and not locus_re.match(raw):
                # Page header: remember its single-letter variables.
                cur = m.group(1)
                pages[cur] = dict(re.findall(r'\$(\w)=(\w+)', raw))
                continue
            m = locus_re.match(raw)
            if not m:
                continue
            page, _, _, ltype, _, text = m.groups()
            if not ltype.upper().startswith('P'):
                continue
            if lang and pages.get(page, {}).get('L') != lang:
                continue
            t = re.sub(r'<!.*?>', '', text)            # drop <!...> spans
            t = re.sub(r'<->', '?', t)                  # <-> becomes an unknown
            t = re.sub(r'<%>|<\$>|<@\w+>', '', t)       # drop remaining tags
            t = re.sub(r'@\d+;', '?', t)                # numeric codes -> unknown
            # [x:y] keeps x; repeated so nested brackets resolve inside-out.
            for _ in range(4):
                t = re.sub(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', t)
            t = t.replace(',', '.')
            t = re.sub(r'[!%]', '', t)
            toks.extend(w for w in t.split('.') if w and re.fullmatch(r'[a-z]+', w))
    return toks

# Multi-character glyphs, tried in this order before falling back to a
# single character; the three-letter forms precede 'ch' so they win.
GL = ['cth', 'ckh', 'cph', 'cfh', 'ch', 'sh']


def eva(w):
    """Split the string ``w`` into a list of glyphs.

    At each position the first entry of ``GL`` that matches there is taken as
    one glyph; if none matches, the single character at that position is the
    glyph. Returns the glyphs in order (an empty list for an empty string).
    """
    glyphs = []
    pos, end = 0, len(w)
    while pos < end:
        glyph = next((g for g in GL if w.startswith(g, pos)), w[pos])
        glyphs.append(glyph)
        pos += len(glyph)
    return glyphs

def roman(n):
    """Return ``n`` written as a lowercase Roman numeral string.

    Uses the subtractive forms (cm, cd, xc, xl, ix, iv). Values of 1000 and
    above simply repeat 'm'; values below 1 give an empty string.
    """
    table = (
        (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),
        (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),
        (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i'),
    )
    pieces = []
    remaining = n
    idx = 0
    # Greedy: emit the current denomination while it fits, then move on.
    while idx < len(table):
        value, numeral = table[idx]
        if remaining >= value:
            pieces.append(numeral)
            remaining -= value
        else:
            idx += 1
    return ''.join(pieces)

def _read_words(path, limit=35000):
    """Return the first ``limit`` lowercase a-z words of a text file.

    The file is read as UTF-8 (the same encoding the Voynich loader uses),
    lowercased, and split into maximal runs of ``a-z``; each word is returned
    as a list of its characters. The file is closed before returning.
    """
    with open(path, encoding='utf-8') as fh:
        text = fh.read().lower()
    return [list(w) for w in re.findall(r'[a-z]+', text)[:limit]]


# One seeded generator drives both synthetic corpora; Roman numerals are
# drawn first, then decimal digits, so the draw order fixes both samples.
rng = random.Random(11)
cal = {
    'Indus (CISI sample)': seqs,
    'Voynich (EVA)': [eva(w) for w in load_voynich()],
    'English': _read_words(f"{BASE}/data/english.txt"),
    'Latin': _read_words(f"{BASE}/data/latin.txt"),
    'Roman numerals': [list(roman(rng.randint(1,4999))) for _ in range(30000)],
    'Decimal digits': [list(str(rng.randint(1,99999))) for _ in range(30000)],
}

# Per corpus: concordance, its shuffled-within-word null, their difference,
# and the two repeat shares. res keeps the raw values for later sections.
print(f"\n{'corpus':<22}{'concordance':>12}{'shuffled null':>14}{'excess':>8}{'sep-rep':>8}{'adj-rep':>8}")
res = {}
for name, words in cal.items():
    c, keep, order = concordance(words)
    b = shuffled_baseline(words)
    s, a = repeats(words)
    res[name] = (c, b, s, a, keep, order)
    print(f"{name:<22}{c:>12.3f}{b:>14.3f}{c-b:>8.3f}{s:>8.3f}{a:>8.3f}")

# ---------------- Indus positional detail ----------------
# Pull the Indus row back out of the results table and tally sign tokens.
c, b, s, a, keep, order = res['Indus (CISI sample)']
gc = Counter()
for w in seqs:
    gc.update(w)

# Tally first and last signs once instead of rescanning per reported sign.
initial_counts = Counter(w[0] for w in seqs)
final_counts = Counter(w[-1] for w in seqs)

print(f"\nIndus signs kept for ordering: {len(keep)}")
print("Top 12 signs: mean position (0=start, 1=end), share text-final, share text-initial:")
for g, n in gc.most_common(12):
    fin = final_counts[g]
    ini = initial_counts[g]
    # Signs that were not kept for ordering have no mean position.
    mp = order.get(g, float('nan'))
    print(f"  {g:<6} n={n:<4} pos={mp:.2f}  final={fin}/{n}  initial={ini}/{n}")

# length distribution: inscription length -> number of inscriptions
lc = Counter(map(len, seqs))
print("\nInscription length distribution:", dict(sorted(lc.items())))