#!/usr/bin/env python3
"""Genre control: does record-genre text (a real recipe compendium) move toward
the Voynich fingerprint relative to novels in the same language?"""
import re, math, random
from collections import Counter, defaultdict

BASE = "/Users/arcandledger/taxdome/ancient-texts"

# Reuse the typology pipeline: execute only the part of typology_test.py that
# precedes its corpora section (the helper functions are defined before it).
# NOTE: exec() runs that code verbatim in a fresh namespace; this is acceptable
# only because typology_test.py is a trusted file of this same project.
_PIPELINE_PATH = f"{BASE}/typology_test.py"
_CORPORA_MARKER = "# --------- corpora at matched size ---------"

# Context manager so the handle is closed right after reading; UTF-8 is the
# default encoding of Python source files, so do not depend on the locale.
with open(_PIPELINE_PATH, encoding="utf-8") as fh:
    src = fh.read()

# Keep everything before the first occurrence of the marker.
head, _marker_found = src.partition(_CORPORA_MARKER)[:2]
if not _marker_found:
    # Same exception type that str.index raised, with an actionable message.
    raise ValueError(f"marker {_CORPORA_MARKER!r} not found in {_PIPELINE_PATH}")

ns = {}
exec(head, ns)

# Pull the pipeline helpers out of the exec'd namespace (KeyError if absent).
load_voynich_lines, chunk, trunc_lines, metrics = (
    ns['load_voynich_lines'], ns['chunk'], ns['trunc_lines'], ns['metrics'])

def lang_lines(path, N):
    """Load a plain-text corpus and shape it like the other pipeline corpora.

    The file at *path* is read as UTF-8 (undecodable bytes are replaced),
    lowercased, and split into maximal runs of the letters a-z. The token
    list is then passed through the pipeline helpers ``chunk`` and
    ``trunc_lines`` with the size limit *N*.

    Args:
        path: Path of the text file to read.
        N: Size limit forwarded to ``trunc_lines`` (the script prints it as a
            token count).

    Returns:
        Whatever ``trunc_lines(chunk(tokens), N)`` returns -- presumably a
        list of token lines cut to N tokens; confirm in typology_test.py.
    """
    with open(path, encoding='utf-8', errors='replace') as fh:
        text = fh.read()
    # Lowercase BEFORE matching. Matching [a-z]+ on the raw text silently
    # dropped every capital letter ("The" -> "he", "Take" -> "ake"), and the
    # later .lower() on already-lowercase matches did nothing.
    # NOTE(review): if typology_test.py tokenizes its own corpora the old way,
    # align it too so the numbers stay comparable.
    toks = re.findall(r'[a-z]+', text.lower())
    return trunc_lines(chunk(toks), N)

# Voynich lines for selector 'B' (its meaning is defined by the pipeline).
voy = load_voynich_lines('B')

# Common corpus size: the Voynich token total, capped at 19000.
N = min(19000, sum(map(len, voy)))

# English comparison texts, in display order: (label, file path).
_ENGLISH_SOURCES = (
    ('English novel', f"{BASE}/data/english.txt"),
    ('English recipes (Digby 1669)', f"{BASE}/data/genre_digby.txt"),
)

# Every corpus is limited to the same size N so the metrics are comparable.
corpora = {'Voynich B': trunc_lines(voy, N)}
for _label, _path in _ENGLISH_SOURCES:
    corpora[_label] = lang_lines(_path, N)

# Metric keys to report, in column order.
cols = ['ttr', 'heaps', 'top100', 'h2w', 'rep', 'mi', 'assort']

print(f"Matched size: {N} tokens\n")

# Header row: a 30-wide left-aligned label column, then one 9-wide
# right-aligned column per metric.
_header = [f"{'corpus':<30}"]
_header.extend(f"{key:>9}" for key in cols)
print(''.join(_header))

# M maps corpus name -> metrics result; each row is printed as soon as its
# metrics have been computed.
M = {}
for _label in corpora:
    M[_label] = metrics(corpora[_label])
    _row = f"{_label:<30}"
    for _key in cols:
        _row += f"{M[_label][_key]:>9.3f}"
    print(_row)

print("\nGenre shift within English (novel -> recipes), and gap to Voynich:")
_novel_m = M['English novel']
_recipe_m = M['English recipes (Digby 1669)']
_voynich_m = M['Voynich B']
for col in cols:
    nov = _novel_m[col]
    rec = _recipe_m[col]
    voyv = _voynich_m[col]

    # Distance to the Voynich value before (novel) and after (recipes) the
    # genre change.
    gap_before = abs(nov - voyv)
    gap_after = abs(rec - voyv)

    # Only a strictly smaller gap counts as moving toward Voynich.
    if gap_after < gap_before:
        direction = "toward Voynich"
    else:
        direction = "away"

    # Share of the novel-to-Voynich gap removed by switching genre, reported
    # as 0 when that gap is (numerically) zero to avoid dividing by it.
    if gap_before > 1e-9:
        closed = 100 * (gap_before - gap_after) / gap_before
    else:
        closed = 0.0

    print(f"  {col:<8} novel={nov:.3f}  recipes={rec:.3f}  Voynich={voyv:.3f}   {direction} ({closed:+.0f}% of gap closed)")

# Which words occur twice in a row in the recipe corpus?
# Flatten the corpus lines into one token stream first, so a pair that
# straddles two consecutive lines is counted as well.
toks = []
for _line in corpora['English recipes (Digby 1669)']:
    toks.extend(_line)

# Tally each token that is immediately followed by an identical token.
reps = Counter()
for _prev, _cur in zip(toks, toks[1:]):
    if _prev == _cur:
        reps[_prev] += 1

print("\nWhat repeats adjacently in real recipes:", reps.most_common(8))
