#!/usr/bin/env python3
"""Builds a Cuman (Kipchak Turkic, c. 1330) lexicon + running-text corpus from
the page-level OCR of Kuun's 1880 edition of the Codex Cumanicus
(Indiana University copy, barcode 30000132241310; public domain).

Why: the typology screen (Part 13 / Part 47) only ever tested MODERN Turkish,
which is Oghuz, not Kipchak, and 600 years too late. The Codex Cumanicus is the
period-correct Kipchak corpus (c. 1330, within a century of the VMS parchment).

Sources inside the OCR volume (image numbers, 1-based):
  - images 396-455 : Kuun's "index alphabeticus" of CUMAN words (one headword
                     per entry, every entry cites 'pag.'); image 456+ is the
                     Persian vocabulary -> excluded.
  - images 262-395 : the codex's "German part": Cuman-German glosses and the
                     CONTINUOUS Cuman texts (Pater Noster, Ave Maria, hymns,
                     riddles) interleaved with Kuun's Latin apparatus.

Method: harvest index headwords by pattern; train char-trigram models
(Cuman = headwords, Latin = data/latin.txt, German = data/de_clean.txt);
classify every token on the text pages; keep maximal runs of >=4 consecutive
Cuman-classified tokens as running text (vocabulary pages produce only short
runs and are thereby excluded automatically).

Outputs: data/cuman_lexicon.txt (one type per line),
         data/cuman_text.txt    (one extracted run per line),
         data/cuman_prose.txt   (runs from images 300-381 only, one per line).
"""
import re, os, math, sys
from collections import Counter
from canonical import DATA

# Directory holding the page-level OCR; page() reads '<image number, 8 digits>.txt'
# from here.
OCR = os.path.join(DATA, 'codex_cumanicus_ocr')
# Image numbers of the Cuman word index (396-455; the range end is exclusive).
INDEX_RANGE = range(396, 456)
# Image numbers of the text pages scanned for running Cuman (262-395).
TEXT_RANGE = range(262, 396)

# Body of a regex character class (it is interpolated inside [...] below and in
# harvest_lexicon): lowercase ASCII letters, accented letters and the apostrophe.
LETTER = "a-zäöüčšžñçàáâéêëèíîïóôõúûůýæœ'"
# Matches a token made up entirely of LETTER characters.
TOKEN_RE = re.compile(f"^[{LETTER}]+$")

# Latin/German/editorial words that can open a line capitalised in the index.
# The same list is also used to drop tokens on the text pages (extract_text).
STOP = set('''pag cf imp ibid idem mend mendose scriptum pro vide index item
vel etc sic hoc loco codex cod fortasse lege adde et in ad de ab ex ar pers
germ osm kaz tag lin vocabularium excursus oratio textus quod quae qui hujus
huic apud idest grammatica notis nota glossa glossae kuun comes anno tom
fol lib cap vol seu sive nec non sed ut cum sub super inter post ante per
ich der die das und ist mir dir wir sie ein eine dem den des me te se ne
si quia sicut enim autem ergo igitur itaque'''.split())


def page(n):
    """Return the OCR text of image ``n`` as a list of lines.

    The page is read from ``<OCR>/<n zero-padded to 8 digits>.txt`` as UTF-8,
    with undecodable bytes replaced. The text is split on ``'\\n'`` only, so a
    file ending in a newline yields a trailing empty string.

    Returns an empty list when the page file does not exist.
    """
    p = os.path.join(OCR, f'{n:08d}.txt')
    if not os.path.exists(p):
        return []
    # Context manager so the handle is closed promptly; this function is called
    # once per page image.
    with open(p, encoding='utf-8', errors='replace') as fh:
        return fh.read().split('\n')


def norm(tok):
    """Return ``tok`` lowercased with OCR debris removed.

    Debris is every run of ASCII or superscript digits, degree/ordinal signs,
    sigla (* † ‡), brackets, quotation marks, the apostrophe, dashes, the
    underscore and the remaining punctuation listed in the pattern below.
    The result may be the empty string.
    """
    debris = r'[0-9¹²³⁴⁵⁶⁷⁸⁹°ºª*†‡()\[\]{}«»"„."‚\',;:!?<>=~|/\\_–—-]+'
    return re.sub(debris, '', tok.lower())


def harvest_lexicon():
    """Collect Cuman headwords from the index pages.

    A line contributes a headword when it starts (after stripping) with a
    capital letter followed by at least one LETTER character, and the text
    'pag' occurs in that line or the two lines after it. The headword is
    normalised with norm() and kept only if it is at least two characters
    long, consists solely of LETTER characters and is not in STOP.

    Returns a Counter mapping each normalised headword to the number of
    times it was harvested.
    """
    headword = re.compile(f"^([A-ZÄÖÜČŠŽÇ][{LETTER}]+)\\b")
    counts = Counter()
    for img in INDEX_RANGE:
        rows = page(img)
        for pos, row in enumerate(rows):
            found = headword.match(row.strip())
            if found is None:
                continue
            # a genuine index entry cites 'pag.' on this line or the next two
            context = ' '.join(rows[pos:pos + 3]).lower()
            if 'pag' not in context:
                continue
            word = norm(found.group(1))
            if len(word) < 2 or not TOKEN_RE.match(word) or word in STOP:
                continue
            counts[word] += 1
    return counts


class CharModel:
    """Character-trigram model over a weighted word list.

    Every word is padded as ``^^word$``. ``c3`` counts each trigram of the
    padded word and ``c2`` counts the two-character context that opens it,
    both weighted by the word's count. ``V`` is the number of distinct
    characters seen in the words plus 3.
    """

    def __init__(self, words):
        """Build the counts from ``words``, a mapping of word -> weight."""
        self.c3 = Counter()
        self.c2 = Counter()
        alphabet = set()
        for word, weight in words.items():
            alphabet.update(word)
            padded = f'^^{word}$'
            for start in range(len(padded) - 2):
                self.c3[padded[start:start + 3]] += weight
                self.c2[padded[start:start + 2]] += weight
        self.V = len(alphabet) + 3

    def logp(self, w):
        """Return the mean per-trigram log-probability of ``w``.

        Each trigram probability is additively smoothed:
        (count + 0.5) / (context count + 0.5 * V). An empty ``w`` scores
        -99.0.
        """
        if not w:
            return -99.0
        padded = f'^^{w}$'
        n_grams = len(padded) - 2
        floor = 0.5 * self.V
        total = 0.0
        # explicit left-to-right accumulation keeps the float result stable
        for start in range(n_grams):
            tri = padded[start:start + 3]
            total += math.log((self.c3[tri] + 0.5) / (self.c2[tri[:2]] + floor))
        return total / n_grams


def load_control(path, pat=r'[a-zäöüß]+'):
    """Count the word forms in a control-language text file.

    The file at ``path`` is read as UTF-8 (undecodable bytes replaced) and
    lowercased; every non-overlapping match of the regex ``pat`` counts as one
    word.

    Returns a Counter mapping each matched string to its frequency.
    Raises FileNotFoundError (from open) when ``path`` does not exist.
    """
    # Context manager so the handle is closed as soon as the text is read.
    with open(path, encoding='utf-8', errors='replace') as fh:
        txt = fh.read().lower()
    return Counter(re.findall(pat, txt))


def extract_text(lex):
    """Extract runs of Cuman-classified tokens from the text pages.

    ``lex`` is the harvested headword mapping (word -> count). Three character
    models are built: Cuman from ``lex``, Latin and German from the 8000 most
    common words of their control files. If the German file is missing, the
    Latin model stands in for it.

    Tokens are normalised with norm(); empty, non-LETTER, one-character and
    STOP tokens are skipped without breaking a run. A token counts as Cuman
    when it is in ``lex`` or when the Cuman model beats both control models by
    more than 0.15. A run ends at the first non-Cuman token or at the end of a
    page, and is kept when it has at least 4 tokens of which at least half are
    lexicon hits.

    Returns a list of ``(image number, [tokens])`` pairs, where the image
    number is the page on which the run ended.
    """
    cuman = CharModel(lex)
    latin = CharModel(dict(load_control(f'{DATA}/latin.txt').most_common(8000)))
    try:
        german = CharModel(
            dict(load_control(f'{DATA}/de_clean.txt').most_common(8000)))
    except FileNotFoundError:
        german = latin

    def in_lexicon(word):
        """True if ``word`` is indexed, or a proper prefix of it of length >= 4
        is (Kipchak is suffixing, so the stem opens the inflected form)."""
        if word in lex:
            return True
        for cut in range(len(word) - 1, 3, -1):
            if word[:cut] in lex:
                return True
        return False

    def looks_cuman(word):
        """True if ``word`` is indexed or scores clearly higher as Cuman."""
        if word in lex:
            return True
        score = cuman.logp(word)
        return (score > latin.logp(word) + 0.15 and
                score > german.logp(word) + 0.15)

    runs = []

    def settle(img, run):
        """Store ``run`` if it qualifies; always hand back a fresh empty run."""
        if len(run) >= 4:
            share = sum(in_lexicon(word) for word in run) / len(run)
            if share >= 0.5:          # Persian/Latin/OCR-noise runs fail this
                runs.append((img, run))
        return []

    pending = []
    for img in TEXT_RANGE:
        for row in page(img):
            for raw in row.split():
                word = norm(raw)
                usable = (word and TOKEN_RE.match(word)
                          and len(word) >= 2 and word not in STOP)
                if not usable:
                    continue
                if looks_cuman(word):
                    pending.append(word)
                else:
                    pending = settle(img, pending)
        # runs never continue across a page boundary
        pending = settle(img, pending)
    return runs


if __name__ == '__main__':
    # Driver: harvest the index headwords, extract the runs, print summary
    # statistics and write the lexicon, all runs and the prose-only runs.
    lex = harvest_lexicon()
    print(f'Cuman index headwords harvested: {len(lex)}')
    runs = extract_text(lex)
    toks = [w for _, r in runs for w in r]
    # prose = the codex's continuous texts only (prayers, hymns, riddles);
    # glossary pages produce word-lists that would distort token statistics
    prose = [(n, r) for n, r in runs if 300 <= n <= 381]
    ptoks = [w for _, r in prose for w in r]
    print(f'All runs: {len(runs)}  tokens: {len(toks)}  types: {len(set(toks))}')
    print(f'Prose runs (img 300-381): {len(prose)}  tokens: {len(ptoks)}  '
          f'types: {len(set(ptoks))}')
    # The encoding is explicit: tokens contain non-ASCII letters (see LETTER)
    # and the inputs are read as UTF-8, so the outputs must not depend on the
    # platform's default encoding.
    with open(f'{DATA}/cuman_lexicon.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(lex)))
    with open(f'{DATA}/cuman_text.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(' '.join(r) for _, r in runs))
    with open(f'{DATA}/cuman_prose.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(' '.join(r) for _, r in prose))
    print('Sample prose runs:')
    for _, r in prose[:3]:
        # show at most the first 14 tokens of each sample run
        print('  ', ' '.join(r[:14]))
    print('Sample lexicon:', ', '.join(sorted(lex)[100:115]))