#!/usr/bin/env python3
"""Part 56: long-range information — the meaning vs meaning-thin discriminator.

The honest wall (Part 44/PAPER) is that grammar, topic and layout are all
reproducible by meaningless local processes (Part 32), so meaning could not be
affirmed or denied. But local generative processes have a hard limit: they
cannot fake the DISTANCE PROFILE of information. Natural language carries
excess mutual information between words at discourse range (~5-100 words,
power-law decay; Lin & Tegmark 2017); topic structure alone carries it only at
page scale; Markov chains decay exponentially; the self-citation process has a
reuse memory whose fingerprint is measurable. Montemurro & Zanette (2013)
showed topic-scale information in the VMS but never controlled against
self-citation or class-Markov generators — that control is the new content
here, plus a within-line / cross-line / cross-page decomposition.

Estimator: plug-in MI at token distance d between low-cardinality category
streams (suffix classes, K=20, bias ~0.01 bits), reported ONLY as excess over
M=20 global-shuffle surrogates of the same stream (exact-in-expectation bias
correction); z from surrogate spread. Companion statistic immune to category
choice: word-identity repetition rate P(w_i = w_{i+d}) vs the same surrogates.

PRE-REGISTERED READS
 1. Real B shows cross-line same-page excess at d~8-64 that dies under
    line-order scrambling and is absent in self-citation + class-Markov at
    matched z -> first discourse-scale evidence; favors language-underneath.
 2. Excess exists but survives line-order scrambling (only dies under page
    scrambling) -> it is topic/burstiness, already known (Part 6); no news.
 3. Generators match real B's curve in magnitude AND shape -> MI(d) joins
    grammar as non-discriminating; meaning-thin stays fully live.
 4. Real B lacks intermediate-range excess where Latin/English/Finnish all
    show it: positive evidence AGAINST plain language-under-encoding (a
    word-preserving encoding cannot destroy discourse-scale MI).
Controls that must behave: global surrogates |z|<2 everywhere; English
positive at small d with slow decay; word-bigram generator positive at d=1
with fast decay.
"""
import math, random
from collections import Counter
from canonical import parse
from generators import (suf, gen_selfcitation, gen_class_markov,
                        gen_word_bigram)

random.seed(11)
M_SH = 20
DGRID = [1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512]

# ------------------------------------------------------------ category maps
def voy_cats(words):
    cats = sorted(set(suf(w) for w in words))
    idx = {c: i for i, c in enumerate(cats)}
    return [idx[suf(w)] for w in words], len(cats)

def nat_cats(words, top=19):
    fin = Counter(w[-2:] for w in words)
    keep = {c for c, _ in fin.most_common(top)}
    cats = sorted(keep) + ['OTH']
    idx = {c: i for i, c in enumerate(cats)}
    return [idx[w[-2:]] if w[-2:] in keep else idx['OTH'] for w in words], len(cats)

# ------------------------------------------------------------ MI machinery
def mi_at(cat, K, d, mask=None):
    """Plug-in MI between cat[i] and cat[i+d]; mask: optional pair filter
    f(i) -> bool on left index."""
    J = [0] * (K * K)
    n = 0
    rng = range(len(cat) - d)
    if mask is None:
        for i in rng:
            J[cat[i] * K + cat[i + d]] += 1
        n = len(cat) - d
    else:
        for i in rng:
            if mask[i] is not None and mask[i] >= d:
                J[cat[i] * K + cat[i + d]] += 1
                n += 1
    if n < 200:
        return None
    pa = [0] * K
    pb = [0] * K
    for a in range(K):
        base = a * K
        for b in range(K):
            c = J[base + b]
            if c:
                pa[a] += c
                pb[b] += c
    mi = 0.0
    for a in range(K):
        base = a * K
        if not pa[a]:
            continue
        for b in range(K):
            c = J[base + b]
            if c:
                mi += c / n * math.log2(c * n / (pa[a] * pb[b]))
    return mi

def rep_at(words, d):
    n = len(words) - d
    return sum(words[i] == words[i + d] for i in range(n)) / n

def curve(cat, K, mask=None):
    return {d: mi_at(cat, K, d, mask) for d in DGRID}

def excess(cat, K, seed=0, mask=None, words=None):
    """Returns dict d -> (delta_mi, z) and optionally repetition excess."""
    real = curve(cat, K, mask)
    rng = random.Random(seed)
    sh_curves = []
    sh_reps = []
    base = cat[:]
    w = words[:] if words else None
    rep_real = {d: rep_at(words, d) for d in DGRID} if words else None
    for _ in range(M_SH):
        order = list(range(len(base)))
        rng.shuffle(order)
        sh = [base[i] for i in order]
        sh_curves.append(curve(sh, K, mask))
        if words:
            sw = [w[i] for i in order]
            sh_reps.append({d: rep_at(sw, d) for d in DGRID})
    out = {}
    for d in DGRID:
        vals = [c[d] for c in sh_curves if c[d] is not None]
        if real[d] is None or len(vals) < 5:
            out[d] = None
            continue
        mu = sum(vals) / len(vals)
        sd = (sum((x - mu) ** 2 for x in vals) / (len(vals) - 1)) ** 0.5 or 1e-9
        out[d] = (real[d] - mu, (real[d] - mu) / sd)
    rep = None
    if words:
        rep = {}
        for d in DGRID:
            vals = [r[d] for r in sh_reps]
            mu = sum(vals) / len(vals)
            sd = (sum((x - mu) ** 2 for x in vals) / (len(vals) - 1)) ** 0.5 or 1e-9
            rep[d] = (rep_real[d] - mu, (rep_real[d] - mu) / sd)
    return out, rep

def fmt_row(name, ex, dsel=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512)):
    cells = []
    for d in dsel:
        e = ex.get(d)
        if e is None:
            cells.append(f'{"--":>9}')
        else:
            dm, z = e
            mark = '*' if abs(z) >= 3 else ('.' if abs(z) >= 2 else ' ')
            cells.append(f'{1000 * dm:>+7.1f}{mark} ')
    return f'{name:<18}' + ''.join(cells)

# ------------------------------------------------------- corpora assembly
def b_stream():
    toks, _ = parse()
    sel = [t for t in toks if t['valid'] and t['is_text'] and t['lang'] == 'B']
    words = [t['word'] for t in sel]
    lines = [t['line_id'] for t in sel]
    pages = [t['page'] for t in sel]
    return words, lines, pages

def a_stream():
    toks, _ = parse()
    sel = [t for t in toks if t['valid'] and t['is_text'] and t['lang'] == 'A']
    return [t['word'] for t in sel]

def load_nat(path, pat=r'[a-z]+', n=None):
    import re
    t = open(path, encoding='utf-8', errors='replace').read().lower()
    ws = re.findall(pat, t)
    return ws[:n] if n else ws

def scramble_lines(words, lines, mode, seed):
    """mode 'within': shuffle word order inside each line.
       mode 'order' : shuffle line order inside each page (needs pages).
       Returns new word list."""
    rng = random.Random(seed)
    out = []
    cur, cur_id = [], None
    groups = []
    for w, lid in zip(words, lines):
        if lid != cur_id and cur:
            groups.append(cur)
            cur = []
        cur_id = lid
        cur.append(w)
    if cur:
        groups.append(cur)
    if mode == 'within':
        for g in groups:
            rng.shuffle(g)
        return [w for g in groups for w in g]
    raise ValueError(mode)

def scramble_lines_within_page(words, lines, pages, seed):
    """Shuffle LINE ORDER within each page; page order and line insides kept.
    Discriminates sequential (discourse-like) structure from page-level
    class-composition homogeneity (register/drift-like)."""
    rng = random.Random(seed)
    page_groups = []
    cur_lines, cur_line, cur_lid, cur_pid = [], [], None, None
    for w, lid, pid in zip(words, lines, pages):
        if pid != cur_pid and cur_line:
            cur_lines.append(cur_line)
            page_groups.append(cur_lines)
            cur_lines, cur_line = [], []
        elif lid != cur_lid and cur_line:
            cur_lines.append(cur_line)
            cur_line = []
        cur_lid, cur_pid = lid, pid
        cur_line.append(w)
    if cur_line:
        cur_lines.append(cur_line)
    if cur_lines:
        page_groups.append(cur_lines)
    out = []
    for pg in page_groups:
        rng.shuffle(pg)
        for l in pg:
            out.extend(l)
    return out

def scramble_units(words, unit_ids, seed):
    """Shuffle the ORDER of units (lines or pages), keep insides intact."""
    rng = random.Random(seed)
    groups, cur, cur_id = [], [], None
    for w, uid in zip(words, unit_ids):
        if uid != cur_id and cur:
            groups.append(cur)
            cur = []
        cur_id = uid
        cur.append(w)
    if cur:
        groups.append(cur)
    rng.shuffle(groups)
    return [w for g in groups for w in g]

if __name__ == '__main__':
    Bw, Bl, Bp = b_stream()
    N = len(Bw)
    print(f'Currier B stream: {N} tokens.  Excess MI in millibits over '
          f'{M_SH} global shuffles;  * |z|>=3,  . |z|>=2')
    dsel = (1, 2, 4, 8, 16, 32, 64, 128, 256, 512)
    head = f'{"corpus":<18}' + ''.join(f'{("d=" + str(d)):>9} ' for d in dsel)
    print(head)

    cat, K = voy_cats(Bw)
    exB, repB = excess(cat, K, seed=1, words=Bw)
    print(fmt_row('B real', exB))

    # structural scrambles of B (topic/discourse separators)
    for name, sw in [
        ('B shuf-in-line', scramble_lines(Bw, Bl, 'within', 21)),
        ('B shuf-line-order', scramble_units(Bw, Bl, 22)),
        ('B lines-in-page', scramble_lines_within_page(Bw, Bl, Bp, 24)),
        ('B shuf-page-order', scramble_units(Bw, Bp, 23)),
    ]:
        c2, K2 = voy_cats(sw)
        ex, _ = excess(c2, K2, seed=2)
        print(fmt_row(name, ex))

    Aw = a_stream()
    cA, KA = voy_cats(Aw)
    exA, _ = excess(cA, KA, seed=3)
    print(fmt_row('A real', exA))

    Blines_l = []
    cur, cur_id = [], None
    for w, lid in zip(Bw, Bl):
        if lid != cur_id and cur:
            Blines_l.append(cur)
            cur = []
        cur_id = lid
        cur.append(w)
    if cur:
        Blines_l.append(cur)

    gens = [
        ('self-cit s1', gen_selfcitation(N, Bw, seed=101)),
        ('self-cit s2', gen_selfcitation(N, Bw, seed=102)),
        ('classMk s1', gen_class_markov(N, Blines_l, seed=201)),
        ('classMk s2', gen_class_markov(N, Blines_l, seed=202)),
        ('wordBigram', gen_word_bigram(N, Blines_l, seed=301)),
    ]
    rep_rows = [('B real', repB)]
    for name, stream in gens:
        c2, K2 = voy_cats(stream)
        ex, rep = excess(c2, K2, seed=4, words=stream)
        print(fmt_row(name, ex))
        rep_rows.append((name, rep))

    from canonical import DATA
    for name, path, pat in [('Latin', f'{DATA}/latin.txt', r'[a-z]+'),
                            ('English', f'{DATA}/english.txt', r'[a-z]+'),
                            ('Finnish', f'{DATA}/fi_clean.txt', r'[a-zäö]+'),
                            ('Digby recipes', f'{DATA}/genre_digby.txt', r'[a-z]+')]:
        ws = load_nat(path, pat, n=N)
        c2, K2 = nat_cats(ws)
        ex, _ = excess(c2, K2, seed=5)
        print(fmt_row(name, ex))

    print()
    print('B DECOMPOSITION — same-line pairs vs all pairs (suffix MI)')
    # same_extent[i] = largest d such that i..i+d stays within one line
    n = len(Bw)
    run_end = [0] * n
    run_end[n - 1] = n - 1
    for i in range(n - 2, -1, -1):
        run_end[i] = run_end[i + 1] if Bl[i] == Bl[i + 1] else i
    same_extent = [run_end[i] - i for i in range(n)]
    exS, _ = excess(cat, K, seed=6, mask=same_extent)
    print(fmt_row('  same-line only', exS, dsel=(1, 2, 3, 4, 6, 8)))
    print(fmt_row('  all pairs', exB, dsel=(1, 2, 3, 4, 6, 8)))
    print('  (all-pairs excess persisting where same-line pairs cannot reach'
          ' = cross-line structure)')

    print()
    print('WORD-IDENTITY REPETITION EXCESS (points vs shuffle, z marks)')
    for name, rep in rep_rows:
        if rep:
            cells = []
            for d in dsel:
                dm, z = rep[d]
                mark = '*' if abs(z) >= 3 else ('.' if abs(z) >= 2 else ' ')
                cells.append(f'{100 * dm:>+7.2f}{mark} ')
            print(f'{name:<18}' + ''.join(cells))
