#!/usr/bin/env python3
"""CANONICAL Voynich parser — single source of truth for every audit test.
Parses an IVTFF transliteration into a structured token table with full
codicological metadata (quire, bifolio, hand, section, language, line, position).
Import this; do not re-implement parsing. Run directly to print the headline table.
"""
import re, sys, os
from collections import Counter, defaultdict

# Where the transliteration files live. Resolution order:
#   1. the VOYNICH_DATA environment variable, when it is set;
#   2. a `data` directory sitting next to this file, when it exists;
#   3. the original development path (kept last for provenance).
_here = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
DATA = os.environ.get('VOYNICH_DATA')
if DATA is None:
    DATA = _here if os.path.isdir(_here) else "/Users/arcandledger/taxdome/ancient-texts/data"

# Locus line such as "<f1r.1,@P0>  text". Capture groups, in order:
#   1 page id (starts with 'f')      2 locus identifier (up to the comma)
#   3 one marker char from @+=*~$&!  4 locus type letters
#   5 optional trailing digits       6 remaining text of the line
LOCUS = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')

# Page header line such as "<f1r> ..."; group 1 is the page id. The '>' must
# follow the page id directly, so a locus line never matches this pattern.
HDR = re.compile(r'^<(f[0-9a-zA-Z]+)>')

def clean_text(text, alt='first', commas='break'):
    """Strip transliteration markup from one locus text.

    alt: which reading of an alternate group [a:b:...] to keep; 'second'
         keeps b, any other value keeps a.
    commas: how ',' (uncertain space) is handled; 'break' turns it into the
         word separator '.', any other value deletes it (words are joined).
    Returns the cleaned string; words remain separated by '.'.
    The defaults reproduce the original behavior byte-for-byte."""
    # Fixed-order rewrites applied before alternate readings are resolved.
    markup_rules = (
        (r'<!.*?>', ''),            # <!...> spans are dropped
        (r'<->', '?'),              # <-> becomes a '?' placeholder
        (r'<%>|<\$>|<@\w+>', ''),   # <%>, <$> and <@word> tags are dropped
        (r'@\d+;', '?'),            # @NNN; codes become a '?' placeholder
    )
    cleaned = text
    for pattern, replacement in markup_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Pick the alternate-reading rewrite once, then apply it in four passes so
    # that groups nested inside other groups are resolved from the inside out.
    if alt == 'second':
        alt_pattern = r'\[([^:\[\]]*):([^:\[\]]*)(?::[^\[\]]*)?\]'
        alt_replacement = r'\2'
    else:
        alt_pattern = r'\[([^:\[\]]*):[^\[\]]*\]'
        alt_replacement = r'\1'
    for _ in range(4):
        cleaned = re.sub(alt_pattern, alt_replacement, cleaned)

    if commas == 'break':
        comma_as = '.'
    else:
        comma_as = ''
    cleaned = cleaned.replace(',', comma_as)

    # Any '!' or '%' still present is removed outright.
    return re.sub(r'[!%]', '', cleaned)

def parse(path=f"{DATA}/ZL3b-n.txt", alphabet='eva', alt='first', commas='break'):
    """Parse a transliteration file into token records and line records.

    Args:
        path: file to read; decoded as UTF-8 with undecodable bytes replaced.
        alphabet: accepted for interface compatibility; the parser does not
            consult it.
        alt: forwarded to clean_text (which alternate reading to keep).
        commas: forwarded to clean_text (how ',' is treated).

    Returns:
        (tokens, lines), two lists of dicts.
        token: {word, valid, page, quire, bifolio, hand, section, lang, ltype,
                is_label, is_text, line_id, pos, line_len, par_first}
        line:  {line_id, page, quire, hand, section, lang, bifolio, ltype,
                is_label, is_text, par_first, words, raw_words}
        Every cleaned word becomes a token; `valid` flags the ones made only of
        [a-z]. A line's `words` holds the valid words, `raw_words` all of them.
        `pos` and `line_len` count all words of the line, valid or not.

    Blank lines, lines starting with '#', lines matching neither the page
    header nor the locus pattern, and loci with no words left after cleaning
    are skipped. line_id numbers the kept loci from 1 in file order.
    """
    is_word = re.compile(r'[a-z]+').fullmatch
    header_var = re.compile(r'\$(\w)=(\w+)')
    pages = {}          # page id -> {single-letter header variable: value}
    tokens, lines = [], []
    line_id = 0
    # The context manager closes the file even if parsing raises part-way.
    with open(path, encoding='utf-8', errors='replace') as fh:
        for raw in fh:
            raw = raw.rstrip('\n')
            if not raw or raw.startswith('#'):
                continue
            m = LOCUS.match(raw)
            if m is None:
                # Not a locus: either a page header carrying $X=value
                # variables, or something to ignore.
                hdr = HDR.match(raw)
                if hdr:
                    pages[hdr.group(1)] = dict(header_var.findall(raw))
                continue
            page, _locus, _gen, ltype, _lsub, text = m.groups()
            meta = pages.get(page, {})
            # '<%>' must be looked for before clean_text strips it.
            par_first = '<%>' in text
            ws = [w for w in clean_text(text, alt=alt, commas=commas).split('.') if w]
            if not ws:
                continue
            kind = ltype.upper()
            is_label = kind.startswith('L')
            is_text = kind.startswith('P')
            quire, bifolio = meta.get('Q'), meta.get('B')
            hand, section, lang = meta.get('H'), meta.get('I'), meta.get('L')
            line_id += 1
            # Judge each word once; reused for the line record and the tokens.
            flags = [bool(is_word(w)) for w in ws]
            valid_ws = [w for w, ok in zip(ws, flags) if ok]
            lines.append(dict(line_id=line_id, page=page, quire=quire, hand=hand,
                              section=section, lang=lang, bifolio=bifolio,
                              ltype=ltype, is_label=is_label, is_text=is_text,
                              par_first=par_first, words=valid_ws, raw_words=ws))
            n_words = len(ws)
            for i, (w, ok) in enumerate(zip(ws, flags)):
                tokens.append(dict(word=w, valid=ok,
                                   page=page, quire=quire, bifolio=bifolio,
                                   hand=hand, section=section, lang=lang,
                                   ltype=ltype, is_label=is_label, is_text=is_text,
                                   line_id=line_id, pos=i, line_len=n_words,
                                   par_first=par_first))
    return tokens, lines

def running_B(lines):
    """Collect the word lists of Currier B running-text lines.

    A line record qualifies when its 'is_text' flag is truthy, its 'lang' is
    'B', and its 'words' list holds at least two entries. The result contains
    the qualifying 'words' lists themselves, in input order."""
    selected = []
    for record in lines:
        if not record['is_text']:
            continue
        if record['lang'] != 'B':
            continue
        if len(record['words']) < 2:
            continue
        selected.append(record['words'])
    return selected

if __name__ == '__main__':
    # Headline table. An optional first command-line argument overrides the
    # default transliteration file.
    path = sys.argv[1] if len(sys.argv) > 1 else f"{DATA}/ZL3b-n.txt"
    toks, lines = parse(path)
    valid = [t for t in toks if t['valid']]
    text = [t for t in valid if t['is_text']]
    labels = [t for t in valid if t['is_label']]

    def stats(ts, name):
        """Print token count, distinct-word count and mean word length of ts."""
        words = [t['word'] for t in ts]
        # Guard the mean against an empty subset (prints 0.00).
        nlen = sum(map(len, words)) / len(words) if words else 0
        print(f"  {name:<22} tokens={len(words):>6}  types={len(set(words)):>5}  mean_len={nlen:.2f}")

    # os.path.basename strips the directory with the platform's separator;
    # splitting on '/' left the full path in place for backslash paths.
    print(f"CANONICAL TOKEN TABLE  ({os.path.basename(path)})")
    print(f"Total loci-tokens: {len(toks)}  |  valid [a-z]+: {len(valid)}")
    stats(text, 'running text (all)')
    stats([t for t in text if t['lang']=='A'], 'running text Currier A')
    stats([t for t in text if t['lang']=='B'], 'running text Currier B')
    stats(labels, 'labels')
    # Every distinct character occurring in valid running-text words.
    alpha = {ch for t in text for ch in t['word']}
    print(f"  glyph alphabet (text): {len(alpha)}  ->  {''.join(sorted(alpha))}")
    # Page metadata may be missing (None); such values are left out of the sets.
    print(f"  quires: {sorted(set(t['quire'] for t in text if t['quire']))}")
    print(f"  hands: {sorted(set(t['hand'] for t in text if t['hand']))}")
    print(f"  sections: {sorted(set(t['section'] for t in text if t['section']))}")
