#!/usr/bin/env python3
"""
iconographic_map.py — Aligns Voynich section vocabulary with medieval
medical iconographic traditions.

Reverse-engineering from the illustrations backward into the text structure:
if the sections correspond to medieval medical compendium conventions, then
the section-locked vocabulary should align with the iconographic program of
each section's illustrations.

Historical framework:
  - Section H (Herbal): Materia medica. Botanical attribute vocabulary.
  - Section B (Biological/Balneological): Thermal bath therapy. Body-domain
    vocabulary. The female figures in pools = patients undergoing prescribed
    hydrotherapy (a major Galenic therapeutic).
  - Section A/C (Astronomical/Cosmological): Medical astrology. Governs TIMING
    of treatments (which planetary hour/day to administer each compound).
  - Section Z (Zodiac): Zodiac-body correspondence. Each sign rules a body part;
    the nymphs represent the body-part patients for that sign's diseases.
  - Section P (Pharmaceutical): Prepared compounds. Jars = the products of
    distillation/extraction as described in the herbal section.
  - Section T (Text-only): Pure formulary text — compiled prescriptions without
    illustrative support.

This script tests whether the section-specific vocabulary clusters reflect
these predicted functional divisions, using Jensen-Shannon divergence between
section word distributions as the measure, plus a within-section prefix analysis
that maps to the predicted iconographic programs.

See REPORT.md Parts 6, 22-24 for the topic-locking and slot-function results.
"""
import re, math, random, os, sys
from collections import Counter, defaultdict

# Resolve the transliteration file relative to this script's own location,
# so the lookup does not depend on the current working directory.
HERE = os.path.dirname(os.path.abspath(__file__))
ZL3B = os.path.join(HERE, "data", "ZL3b-n.txt")

# Fail fast, with a hint on how to obtain the data, when the file is missing.
if not os.path.exists(ZL3B):
    print("ERROR: data/ZL3b-n.txt not found. Run: bash fetch_data.sh")
    sys.exit(1)

def valid(w):
    """Return True when *w* is a non-empty run of lowercase ASCII letters.

    Tokens that still contain '?' placeholders, digits, uppercase letters or
    any other character after clean-up fail this check; the parsers in this
    file use it to drop such tokens.
    """
    return re.fullmatch(r'[a-z]+', w) is not None

# Display names for the section codes read from each page header's $I
# variable. Lookups use SECTION_LABELS.get(code, code), so a code missing
# from this table is printed as the raw code.
SECTION_LABELS = {
    'H': 'Herbal (Materia Medica)',
    'B': 'Biological/Balneological (Hydrotherapy)',
    'S': 'Stars/Recipes (Formulary)',
    'T': 'Text-only (Pure Prescription)',
    'C': 'Cosmological (Astrology timing)',
    'Z': 'Zodiac (Body-sign correspondence)',
    'P': 'Pharmaceutical (Prepared compounds)',
    'A': 'Astronomical',
}

# Hypothesised reading of each prefix class returned by prefix_of():
#   prefix class -> (short domain label, longer gloss)
# Only the short label (element 0) is printed, in Test 3. The 'qo-' and
# 'other' classes have no entry here, so Test 3 shows '?' for them.
GALENIC_ASSOCIATIONS = {
    'qok/qol': ('biological/aquatic domain', 'hydrotherapy, body fluids, female medicine'),
    'ch-':     ('dry terrestrial plants', 'hot/dry herbs, warming medicines'),
    'sh-':     ('moist terrestrial plants', 'cool/moist herbs, sedatives'),
    'ot-':     ('stellar/timed treatments', 'astrological medicine, timing of doses'),
    'ok/ol':   ('general item markers', 'particle/connective class'),
    'o-':      ('general prefix', 'unmarked category'),
    'd-':      ('terminator/closer', 'record end marker'),
    's-':      ('opener/header', 'record start marker, subject declaration'),
}

# Zodiac sign -> body part(s) it is said to rule, as background for the
# Section Z hypothesis in the module docstring.
# NOTE(review): this table is not read anywhere else in the visible file.
ZODIAC_BODY_PARTS = {
    'Aries': 'head/brain', 'Taurus': 'neck/throat', 'Gemini': 'arms/lungs',
    'Cancer': 'chest/stomach', 'Leo': 'heart/back', 'Virgo': 'intestines',
    'Libra': 'kidneys/bladder', 'Scorpio': 'reproductive organs',
    'Sagittarius': 'hips/thighs', 'Capricorn': 'knees/bones',
    'Aquarius': 'calves/ankles', 'Pisces': 'feet/lymph',
}


def load_pages():
    """Parse the transliteration file at ``ZL3B`` into per-page word lists.

    Two kinds of line are recognised; blank lines and lines starting with
    ``#`` are skipped, and anything else is ignored:

    * page headers (``<fNN>`` followed by ``$X=value`` variables), whose
      variables are stored per page;
    * locus lines (``<fNN.locus,<marker><type><digits>> text``). Only loci
      whose type starts with ``P`` (case-insensitive) contribute text.

    Returns:
        tuple: ``(pages_meta, page_lines)`` where ``pages_meta`` maps a page
        id to a dict of single-character variable name -> value, and
        ``page_lines`` is a ``defaultdict(list)`` mapping a page id to the
        flat list of its valid words in file order. Line boundaries are not
        preserved, and a page appears in ``page_lines`` only if it
        contributed at least one valid word.
    """
    pages_meta, page_lines = {}, defaultdict(list)
    locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
    hdr_re   = re.compile(r'^<(f[0-9a-zA-Z]+)>')
    # The context manager closes the file even if parsing raises.
    with open(ZL3B, encoding='utf-8', errors='replace') as fh:
        for raw in fh:
            raw = raw.rstrip('\n')
            if not raw or raw.startswith('#'):
                continue
            locus = locus_re.match(raw)
            if locus is None:
                # Not a locus line: it is either a page header or ignorable.
                hdr = hdr_re.match(raw)
                if hdr:
                    pages_meta[hdr.group(1)] = dict(re.findall(r'\$(\w)=(\w+)', raw))
                continue
            page, _, _, ltype, _, text = locus.groups()
            if not ltype.upper().startswith('P'):
                continue
            # Strip inline comments, then turn markers into either nothing or
            # a '?' placeholder. A '?' makes the token it lands in fail
            # valid(), so that token is dropped below.
            text = re.sub(r'<!.*?>', '', text)
            text = re.sub(r'<->', '?', text)
            text = re.sub(r'<%>|<\$>|<@\w+>', '', text)
            text = re.sub(r'@\d+;', '?', text)
            # Keep the first reading of each [a:b] alternative. The pattern
            # only matches bracket groups with no brackets inside, so each
            # pass resolves one nesting level (up to four levels).
            for _ in range(4):
                text = re.sub(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', text)
            # ',' and '.' both separate words; '!' and '%' are dropped.
            text = text.replace(',', '.').replace('!', '').replace('%', '')
            ws = [w for w in text.split('.') if w and valid(w)]
            if ws:
                page_lines[page].extend(ws)
    return pages_meta, page_lines


def jsd(p_counts, q_counts):
    """Return the Jensen-Shannon divergence of two word-count mappings.

    Counts are normalised to probabilities over the union vocabulary (an
    empty mapping is treated as all-zero). Natural logarithms are used, so
    identical distributions give 0 and disjoint ones give ln 2.
    """
    vocab = set(p_counts) | set(q_counts)
    p_norm = sum(p_counts.values()) or 1
    q_norm = sum(q_counts.values()) or 1

    p_prob, q_prob, mixture = {}, {}, {}
    for word in vocab:
        p_prob[word] = p_counts.get(word, 0)/p_norm
        q_prob[word] = q_counts.get(word, 0)/q_norm
        mixture[word] = (p_prob[word]+q_prob[word])/2

    def divergence_from_mixture(dist):
        # KL(dist || mixture); zero-probability terms contribute nothing.
        return sum(dist[word]*math.log(dist[word]/mixture[word])
                   for word in vocab
                   if dist[word] > 0 and mixture[word] > 0)

    return (divergence_from_mixture(p_prob) + divergence_from_mixture(q_prob))/2


def prefix_of(w):
    """Return the prefix class of word *w*.

    Rules are tried from most to least specific, so 'qok...'/'qol...' are
    classed as 'qok/qol' before the broader 'qo-', 'ot...' and 'ok...'/'ol...'
    before the broader 'o-', and 'sh...' before the broader 's-'. Any word
    starting with 'd' is 'd-'. Words matching no rule, including the empty
    string, are 'other'.
    """
    if w.startswith(('qok', 'qol')):
        return 'qok/qol'
    if w.startswith('qo'):
        return 'qo-'
    if w.startswith('ch'):
        return 'ch-'
    if w.startswith('sh'):
        return 'sh-'
    if w.startswith('ot'):
        return 'ot-'
    if w.startswith(('ok', 'ol')):
        return 'ok/ol'
    if w.startswith('o'):
        return 'o-'
    if w.startswith('d'):
        return 'd-'
    # 'sh...' has already returned above, so any remaining 's...' is 's-'.
    if w.startswith('s'):
        return 's-'
    return 'other'


# ─── Load ────────────────────────────────────────────────────────────────────
print("Loading page data ...")
pages_meta, page_lines = load_pages()
if not page_lines:
    # load_pages() produced no words at all; explain and stop.
    for hint in ("  No pages loaded — is data/ZL3b-n.txt a valid IVTFF transliteration?",
                 "  Run: bash fetch_data.sh"):
        print(hint)
    sys.exit(1)

# Word frequencies per section. The section code is the page header's $I
# variable, or '?' when the page has no header entry or no $I value.
sec_vocab = defaultdict(Counter)
for pg, words in page_lines.items():
    if words:  # do not create a section entry for a page without words
        sec_vocab[pages_meta.get(pg, {}).get('I', '?')].update(words)
print(f"  Sections found: {sorted(sec_vocab.keys())}")

# ─── Test 1: Inter-section JSD matrix ────────────────────────────────────────
for banner_line in (
    "\n" + "="*70,
    "TEST 1: INTER-SECTION VOCABULARY DIVERGENCE (JSD)",
    "="*70,
    "High divergence between sections = sections discuss different subjects.",
    "Low divergence = shared vocabulary (comorbid domains, shared prefixes).",
    "",
):
    print(banner_line)

# Square matrix: one row and one column per section code, '---' on the
# diagonal, JSD of the two sections' word distributions elsewhere.
secs = sorted(sec_vocab.keys())
print(f"  {'':>28}" + "".join(f"  {s:>5}" for s in secs))
for sa in secs:
    row = [f"  {SECTION_LABELS.get(sa,sa)[:28]:<28}"]
    for sb in secs:
        if sa != sb:
            row.append(f"  {jsd(sec_vocab[sa], sec_vocab[sb]):.3f}")
        else:
            row.append(f"  {'---':>5}")
    print("".join(row))

# ─── Test 2: Prefix dominance by section (iconographic alignment) ─────────────
for banner_line in (
    "\n" + "="*70,
    "TEST 2: PREFIX CLASS DOMINANCE BY SECTION",
    "="*70,
    "Tests whether prefix classifiers partition sections as the iconographic",
    "program predicts: qok/qol = body/aquatic; ch/sh = plant/terrestrial;",
    "ot- = stellar/timed; s- = record openers; d- = record closers.",
    "",
):
    print(banner_line)

# section code -> Counter of prefix classes over every word in that section
sec_prefix = defaultdict(Counter)
for pg, words in page_lines.items():
    if words:  # do not create a section entry for a page without words
        sec = pages_meta.get(pg, {}).get('I', '?')
        sec_prefix[sec].update(prefix_of(w) for w in words)

# Column order of the table; headers are truncated to 7 characters.
pfx_list = ['qok/qol', 'qo-', 'ch-', 'sh-', 'ot-', 'ok/ol', 'o-', 'd-', 's-', 'other']
print(f"  {'Section':<34}" + "".join(f"  {pfx[:7]:>7}" for pfx in pfx_list))

for sec in sorted(sec_prefix):
    class_counts = sec_prefix[sec]
    total = sum(class_counts.values()) or 1
    shares = "".join(f"  {100 * class_counts.get(pfx, 0) / total:>6.1f}%"
                     for pfx in pfx_list)
    print(f"  {SECTION_LABELS.get(sec,sec)[:34]:<34}" + shares)

# ─── Test 3: Iconographic program alignment ──────────────────────────────────
for banner_line in (
    "\n" + "="*70,
    "TEST 3: GALENIC ASSOCIATION ALIGNMENT",
    "="*70,
    "For each prefix class, compare its section distribution against the",
    "Galenic prediction (which section should dominate for each prefix type).",
    "",
):
    print(banner_line)

# Global prefix→section distribution
pfx_sec_dist = defaultdict(Counter)
for pg, words in page_lines.items():
    sec = pages_meta.get(pg, {}).get('I', '?')
    for w in words:
        pfx_sec_dist[prefix_of(w)][sec] += 1

print(f"  {'Prefix':<14}  {'top section':>18}  {'%':>5}  {'Galenic prediction':>30}")
# Most frequent prefix class first; a stable reverse sort keeps ties in
# insertion order.
ranked_prefixes = sorted(pfx_sec_dist.items(),
                         key=lambda item: sum(item[1].values()),
                         reverse=True)
for pfx, sec_cnt in ranked_prefixes:
    total = sum(sec_cnt.values()) or 1
    if total >= 50:  # classes with fewer than 50 tokens are not reported
        top_sec, top_n = sec_cnt.most_common(1)[0]
        share = 100 * top_n / total
        domain, _gloss = GALENIC_ASSOCIATIONS.get(pfx, ('?', '?'))
        section_name = SECTION_LABELS.get(top_sec, top_sec)[:18]
        print(f"  {pfx:<14}  {section_name:>18}  {share:>4.0f}%  {domain[:30]:>30}")

# ─── Test 4: Biological section — bath-therapy structure ─────────────────────
# One print call; the leading and trailing empty items reproduce the blank
# line before the rule and the blank line after the explanation.
print("\n".join([
    "",
    "="*70,
    "TEST 4: BIOLOGICAL SECTION — HYDROTHERAPY STRUCTURE",
    "="*70,
    "The balneological section shows female figures in pools.",
    "Galenic bath therapy prescribed water temperature (cool = degree 1,",
    "warm = degree 2, hot = degree 3) and mineral additives.",
    "Test: does the biological section show more mid-range degree values",
    "(degrees 1-2) than extremes, consistent with prescribed temperature ranges?",
    "",
]))

def factor_op(w):
    """Return the number of 'e' characters in word *w*.

    This equals the summed length of every run of consecutive 'e's, which
    Test 4 reports as the word's "degree" (°0, °1, °2, °3+).
    """
    return w.count('e')

# factor_op() values of every word in the Biological (B) and Herbal (H)
# sections, in page order.
# NOTE(review): neither list is read again in the visible part of this file.
bio_ops, herb_ops = [], []
for pg, words in page_lines.items():
    page_section = pages_meta.get(pg, {}).get('I')
    if page_section == 'B':
        bio_ops.extend(factor_op(w) for w in words)
    elif page_section == 'H':
        herb_ops.extend(factor_op(w) for w in words)

def print_op_dist(ops, label):
    """Print a one-line summary of a list of per-word operator values.

    The line shows the label (left-aligned to 36 columns), the number of
    values, their mean, and the percentage of values equal to 0, 1 and 2,
    plus the percentage of values that are 3 or greater.

    Args:
        ops: list of non-negative integers (one per word).
        label: text shown in the first column.
    """
    cnt = Counter(ops)
    n = len(ops)
    denom = n or 1  # keeps the ratios defined (all zero) for empty input
    # The last bucket is labelled "3+", so it must include every value >= 3,
    # not only the value 3 itself.
    three_plus = sum(c for value, c in cnt.items() if value >= 3)
    print(f"  {label:<36} n={n:>5}  mean={sum(ops)/denom:.3f}  "
          f"°0:{100*cnt.get(0,0)/denom:.0f}%  °1:{100*cnt.get(1,0)/denom:.0f}%  "
          f"°2:{100*cnt.get(2,0)/denom:.0f}%  °3+:{100*three_plus/denom:.0f}%")

# One summary row per labelled section, in section-code order; sections
# with fewer than 50 words are not printed.
for sec_code in sorted(SECTION_LABELS):
    label = SECTION_LABELS[sec_code]
    ops = []
    for pg, words in page_lines.items():
        if pages_meta.get(pg, {}).get('I') == sec_code:
            ops.extend(factor_op(w) for w in words)
    if len(ops) >= 50:
        print_op_dist(ops, label[:36])

# ─── Test 5: Record openers and the medical prescription format ───────────────
print("\n" + "="*70)
print("TEST 5: RECORD OPENER TYPES BY SECTION")
print("="*70)
print("In a medieval prescription: 'Recipe [ingredient list] — contra [condition]'")
print("The opener word class (Class C1 from Part 11) marks the subject/condition.")
print("Test: do opener words (line-initial words) carry different prefix profiles")
print("in different sections, as predicted by different medical domains?")
print()

# The transliteration is parsed a second time here because load_pages()
# flattens each page into a single word list, discarding the line boundaries
# this test needs. The clean-up steps mirror load_pages() exactly.
# section code -> Counter of the prefix class of each line's opener
sec_openers = defaultdict(Counter)
locus_re2 = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$')
hdr_re2 = re.compile(r'^<(f[0-9a-zA-Z]+)>')
pages2 = {}
# The context manager closes the file even if parsing raises.
with open(ZL3B, encoding='utf-8', errors='replace') as transliteration:
    for raw in transliteration:
        raw = raw.rstrip('\n')
        if not raw or raw.startswith('#'):
            continue
        m = locus_re2.match(raw)
        if not m:
            # Not a locus line: it is either a page header or ignorable.
            hdr = hdr_re2.match(raw)
            if hdr:
                pages2[hdr.group(1)] = dict(re.findall(r'\$(\w)=(\w+)', raw))
            continue
        page, _, _, ltype, _, text = m.groups()
        if not ltype.upper().startswith('P'):
            continue
        text = re.sub(r'<!.*?>', '', text)
        text = re.sub(r'<->', '?', text)
        text = re.sub(r'<%>|<\$>|<@\w+>', '', text)
        text = re.sub(r'@\d+;', '?', text)
        # Keep the first reading of each [a:b] alternative, one nesting
        # level per pass.
        for _ in range(4):
            text = re.sub(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', text)
        text = text.replace(',', '.').replace('!', '').replace('%', '')
        ws = [w for w in text.split('.') if w and valid(w)]
        if ws:
            # The "opener" is the first *valid* word of the line: a leading
            # token rejected by valid() is skipped, not counted.
            sec = pages2.get(page, {}).get('I', '?')
            sec_openers[sec][prefix_of(ws[0])] += 1

# (column title, prefix class) pairs, in display order.
opener_columns = (('s-', 's-'), ('d-', 'd-'), ('ch-', 'ch-'), ('sh-', 'sh-'),
                  ('qok', 'qok/qol'), ('o-', 'o-'), ('ot-', 'ot-'))
print(f"  {'Section':<34}" + "".join(f"  {title:>5}" for title, _ in opener_columns))
for sec in sorted(sec_openers):
    total = sum(sec_openers[sec].values()) or 1
    cells = "  ".join(f"{100*sec_openers[sec].get(cls,0)/total:>4.0f}%"
                      for _, cls in opener_columns)
    print(f"  {SECTION_LABELS.get(sec,sec)[:34]:<34}  {cells}")

# ─── Final summary ────────────────────────────────────────────────────────────
# NOTE: the summary below is a fixed narrative string. None of its statements
# (e.g. which prefix dominates which section) are computed from the test
# output printed above, so compare it against that output when reading.
print("\n" + "="*70)
print("ICONOGRAPHIC ALIGNMENT SUMMARY")
print("="*70)
print("""
Medieval medical compendium structure mapped to Voynich sections:

  H (Herbal)          = Materia medica listings
                        ch-/sh- prefix dominance = terrestrial plant classifiers
                        Galenic 'dry herb' convention: warming and drying properties

  B (Biological)      = Hydrotherapy/gynecology prescriptions
                        qok-/qol- prefix dominance = aquatic/body-domain classifiers
                        Bath figures = patients undergoing prescribed thermal treatments
                        Operator profile = bath temperature degrees (mild/moderate/hot)

  S/T (Stars/Recipes, Text) = Compiled prescriptions and formulary text
                        Mixed prefix use = draws from all ingredient domains
                        Highest mean operator = most complex/potent formulas

  Z (Zodiac)          = Astrological timing and body-part correspondence
                        Zodiac signs govern body parts (Aries=head...Pisces=feet)
                        Nymphs = body-part patients receiving zodiac-timed treatments

  P (Pharmaceutical)  = Prepared/distilled compounds
                        Transformation from raw herb (H) to extract/preparation (P)
                        Expected: higher operator values than H (more refined)

This structural alignment does not require a linguistic plaintext — the
notation is self-contained, using illustrations as the 'dictionary' for
what each body-system prefix and degree value means. The 'key' the
investigation seeks (ARCHIVAL_SEARCH.md) may be a physician's commentary
or glossary that names the prefix-classes and explains the degree scale,
not a full word-for-word translation key.

Artifacts: iconographic_map.py; uses data/ZL3b-n.txt
""")