#!/usr/bin/env python3
"""Quantitative profile of the Linear A corpus (lineara.xyz / GORILA encoding)."""
import re, json
from collections import Counter, defaultdict

BASE = "/Users/arcandledger/taxdome/ancient-texts/lineara.xyz"

# sign -> conventional sound value (via Linear B homographs).
# ideograms.js is scanned for ["<one char>","<value>"] pairs. The `with`
# block closes the file handle as soon as the text has been read.
with open(f"{BASE}/ideograms.js", encoding='utf-8') as _fh:
    sign_map = dict(re.findall(r'\["(.)","([^"]+)"\]', _fh.read()))

with open(f"{BASE}/LinearAInscriptions.js", encoding='utf-8') as _fh:
    src = _fh.read()

# The corpus is the array literal between `new Map([` and the closing `]);`.
# str.find() returns -1 on a miss, which would otherwise silently slice the
# wrong span, so both markers are checked explicitly.
_marker = 'new Map(['
_open = src.find(_marker)
if _open < 0:
    raise ValueError(f"{_marker!r} not found in LinearAInscriptions.js")
_start = _open + len(_marker)
_end = src.find(']);', _start)
if _end < 0:
    raise ValueError("closing ']);' of the Map literal not found in LinearAInscriptions.js")

# Turn the JavaScript array body into something json.loads accepts.
inner = src[_start:_end]
inner = re.sub(r',(\s*[}\]])', r'\1', inner)   # strip JS trailing commas
# ES6 \u{XXXX} escapes are not valid JSON; replace each with the character it
# names. The doubled-backslash spelling is handled first, then the single one.
inner = re.sub(r'\\\\u\{([0-9a-fA-F]+)\}', lambda m: chr(int(m.group(1), 16)), inner)
inner = re.sub(r'\\u\{([0-9a-fA-F]+)\}', lambda m: chr(int(m.group(1), 16)), inner)
# Each parsed entry is consumed downstream as a (name, attributes) pair.
entries = json.loads('[' + inner.rstrip().rstrip(',') + ']')

def LA(ch):
    """Return True if the single character *ch* lies in the Linear A block (U+10600..U+1077F)."""
    return 0x10600 <= ord(ch) <= 0x1077F


def NUM(ch):
    """Return True if the single character *ch* is an Aegean numeric sign.

    The accepted range starts at U+10107 (AEGEAN NUMBER ONE) rather than at
    the start of the Aegean Numbers block, so that the block's punctuation --
    word separator line U+10100, word separator dot U+10101 and check mark
    U+10102 -- is not counted as a numeral. The upper bound is the end of the
    block (U+1013F), which also covers the measure signs that follow the
    numbers.
    """
    return 0x10107 <= ord(ch) <= 0x1013F


SEP = '\U00010101'                                       # word separator dot

# One record per corpus entry; a missing or empty 'parsedInscription' is
# normalised to the empty string.
insc = [
    {
        'name': label,
        'site': fields.get('site', ''),
        'context': fields.get('context', ''),
        'text': fields.get('parsedInscription') or '',
    }
    for label, fields in entries
]

# Corpus-wide tallies over the concatenation of all inscription texts.
sites = Counter(record['site'] for record in insc)
all_chars = ''.join(record['text'] for record in insc)
sign_tokens = list(filter(LA, all_chars))
num_tokens = list(filter(NUM, all_chars))
uniq = Counter(sign_tokens)

def words_of(t):
    """Return the sign 'words' found in inscription text *t*.

    The text is split on runs of newlines, whitespace and the word separator
    SEP. Within each piece only Linear A characters are kept, and pieces that
    keep fewer than two signs are dropped.
    """
    boundary = r'[\n' + SEP + r'\s]+'
    candidates = (''.join(filter(LA, piece)) for piece in re.split(boundary, t))
    return [signs for signs in candidates if len(signs) >= 2]

# Gather every word token in corpus order and, per word type, the set of
# sites at which it is attested.
words = []
word_site = defaultdict(set)
for record in insc:
    found = words_of(record['text'])
    words.extend(found)
    for token in found:
        word_site[token].add(record['site'])
wc = Counter(words)

def tr(w):
    """Transliterate word *w* sign by sign via sign_map, hyphen-joined.

    Signs that have no entry in sign_map are rendered as '?'.
    """
    values = [sign_map.get(sign, '?') for sign in w]
    return '-'.join(values)

def _pct(part, whole):
    """Return 100 * part / whole, or 0.0 when *whole* is zero.

    Guards the report against ZeroDivisionError when the corpus (or the word
    list derived from it) is empty.
    """
    return 100 * part / whole if whole else 0.0


# Inscriptions whose text contains at least one character accepted by NUM.
with_num = sum(1 for i in insc if any(NUM(c) for c in i['text']))
# Word types that occur exactly once; computed once and reused below.
hapax = sum(1 for n in wc.values() if n == 1)

print(f"Inscriptions: {len(insc)}   sites: {len(sites)}")
print(f"Top sites: {sites.most_common(6)}")
print(f"Sign tokens (syllabograms/logograms): {len(sign_tokens)}   unique signs: {len(uniq)}")
print(f"Numeral tokens: {len(num_tokens)}")
print(f"Inscriptions containing numerals: {with_num} ({_pct(with_num, len(insc)):.0f}%)")
print(f"Sign 'words' (len>=2): {len(words)} tokens, {len(wc)} types, "
      f"hapax: {hapax} ({_pct(hapax, len(wc)):.0f}% of types)")
print("\nTop 15 words (conventional sound values):")
for w, c in wc.most_common(15):
    print(f"  {tr(w):<22} x{c:<4} at {len(word_site[w])} site(s)")
print("\nWords attested at >=3 different sites (stable cross-site vocabulary):")
# (word, number of sites, token count), ordered by number of sites descending;
# the sort is stable, so ties keep the iteration order of wc.
multi = sorted(((w, len(word_site[w]), wc[w]) for w in wc if len(word_site[w]) >= 3), key=lambda x: -x[1])
for w, ns, c in multi[:15]:
    print(f"  {tr(w):<22} sites={ns}  count={c}")
print(f"\nTotal multi-site words: {len(multi)} of {len(wc)} types "
      f"({_pct(len(multi), len(wc)):.1f}%)")
