#!/bin/bash
# Reconstructs ./data/ with the EXACT processed filenames the analysis scripts open.
# Sources are public; we fetch rather than redistribute. Requires: bash, curl, python3, git.
# Verified to produce every file referenced by canonical.py and the *_test.py scripts.
# Strict mode: abort on any failed command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Work from the script's own directory so ./data/ is created next to it no
# matter where the script is invoked from. The `--` guards stop a path that
# begins with '-' from being parsed as an option.
cd -- "$(dirname -- "$0")"
mkdir -p data
python3 - <<'PY'
import re, sys, os, subprocess
def get(url):
    """Download ``url`` with curl and return the body as text.

    The body is decoded as UTF-8; undecodable bytes become U+FFFD.

    curl is run with ``--fail`` so that an HTTP error status (404, 500, ...)
    makes curl exit non-zero instead of handing back the server's error page
    as if it were the requested document. ``--show-error`` keeps curl's
    diagnostic on stderr despite ``-s`` so it can be included in the exception.

    Raises:
        RuntimeError: curl exited non-zero or the response body was empty.
    """
    cmd = ['curl', '-sL', '--fail', '--show-error',
           '--max-time', '90', '-A', 'Mozilla/5.0', url]
    r = subprocess.run(cmd, capture_output=True)
    if r.returncode != 0 or not r.stdout:
        detail = (r.stderr or b'').decode('utf-8', 'replace').strip()
        msg = f'fetch failed: {url}'
        if detail:
            msg += f' ({detail})'
        raise RuntimeError(msg)
    return r.stdout.decode('utf-8','replace')
def gutenberg(eid):
    """Fetch Project Gutenberg ebook ``eid`` and return the text between its markers.

    The slice starts on the line after the first '*** START' and stops just
    before the first '*** END'. If either marker is absent the full download
    is returned unchanged.
    """
    text = get(f"https://www.gutenberg.org/cache/epub/{eid}/pg{eid}.txt")
    start = text.find('*** START')
    end = text.find('*** END')
    if start < 0 or end < 0:
        return text
    # Skip the remainder of the START marker line itself.
    body_from = text.find('\n', start) + 1
    return text[body_from:end]
def words_only(t):
    """Collapse ``t`` to a single-spaced word stream.

    Every run of whitespace becomes one space, and leading/trailing
    whitespace is removed.
    """
    collapsed = re.sub(r'\s+',' ', t)
    return collapsed.strip()
D='data'  # output directory; the shell wrapper runs `mkdir -p data` before Python starts

# --- Voynich transliterations (primary + replication) ---
print('Voynich ZL3b (EVA) + GC (v101) ...', file=sys.stderr)
# Download first, open second: a failed fetch then raises without truncating a
# good copy left by an earlier run. UTF-8 is written explicitly so the output
# does not depend on the locale's default encoding.
for fname in ('ZL3b-n.txt', 'GC2a-n.txt'):
    text = get(f'http://www.voynich.nu/data/{fname}')
    with open(f'{D}/{fname}', 'w', encoding='utf-8') as fh:
        fh.write(text)

# --- English control: Alice + Dracula ---
print('English control ...', file=sys.stderr)
# Gutenberg ebooks 11 and 345, each collapsed to a word stream and joined by a
# single space.
eng = words_only(gutenberg(11)) + ' ' + words_only(gutenberg(345))
# Explicit UTF-8 and a context manager: the bytes written do not depend on the
# locale, and the handle is closed deterministically.
with open(f'{D}/english.txt', 'w', encoding='utf-8') as fh:
    fh.write(eng)

# --- Latin control: Caesar, De Bello Gallico I-VI (HTML stripped) ---
print('Latin control (Caesar) ...', file=sys.stderr)
import html as _h
raw=''
for i in range(1,7):
    # Still best-effort per book, but a skipped book is reported: a silently
    # shortened corpus would otherwise be indistinguishable from a full one.
    try: raw += get(f"https://www.thelatinlibrary.com/caesar/gall{i}.shtml")
    except Exception as e: print(f'  skip gall{i}', e, file=sys.stderr)
# Drop <script> elements including their content, then replace every remaining
# tag with a space.
raw = re.sub(r'<script.*?</script>','',raw,flags=re.S|re.I)
txt = re.sub(r'<[^>]+>',' ', raw)
txt=_h.unescape(txt)
# Remove bracketed numbers such as [12] and any other digit runs.
txt = re.sub(r'\[\d+\]|\d+','',txt)
txt = words_only(txt)
if txt:
    with open(f'{D}/latin.txt', 'w', encoding='utf-8') as fh:
        fh.write(txt)
else:
    # Nothing usable was fetched: leave any existing latin.txt untouched rather
    # than replacing it with an empty file.
    print('  no Latin text fetched; latin.txt not written', file=sys.stderr)

# --- Finnish (Heaps/chaining control) ---
print('Finnish control ...', file=sys.stderr)
# Download before opening the output so a failed fetch raises without leaving
# a truncated fi_clean.txt behind; write UTF-8 regardless of locale.
fi_text = gutenberg(11940)
with open(f'{D}/fi_clean.txt', 'w', encoding='utf-8') as fh:
    fh.write(fi_text)

# --- Genre control: Digby recipe compendium (1669) ---
print('Recipe-genre control (Digby) ...', file=sys.stderr)
# Download before opening the output so a failed fetch raises without leaving
# a truncated genre_digby.txt behind; write UTF-8 regardless of locale.
digby_text = gutenberg(16441)
with open(f'{D}/genre_digby.txt', 'w', encoding='utf-8') as fh:
    fh.write(digby_text)

# --- Typology screen corpora (script-filtered downstream by typology_test.py) ---
print('Typology corpora (hu, fi, cs, el, de) ...', file=sys.stderr)
typo = {'hu':34759, 'cs':37525}  # tag -> Project Gutenberg ebook id
for tag,eid in typo.items():
    try:
        # Fetch before opening: when the download fails the file is simply not
        # created, instead of being left behind as a zero-length typo_<tag>.txt.
        text = gutenberg(eid)
        with open(f'{D}/typo_{tag}.txt', 'w', encoding='utf-8') as fh:
            fh.write(text)
    except Exception as e: print('  skip',tag,e,file=sys.stderr)
# Greek: concatenate a few short PG texts
gel=''
for eid in (39536,17996,39208,39409):
    # Best-effort per text, but say which one was skipped so an incomplete
    # Greek corpus is visible in the log.
    try: gel += gutenberg(eid)+'\n'
    except Exception as e: print('  skip el', eid, e, file=sys.stderr)
if gel:
    with open(f'{D}/typo_el.txt', 'w', encoding='utf-8') as fh:
        fh.write(gel)
else:
    # Every fetch failed: do not create (or overwrite with) an empty file.
    print('  skip el: no Greek text fetched', file=sys.stderr)
# bible-corpus XML (Turkish, Farsi, Hebrew) -> stripped text
for tag,name in [('tr','Turkish'),('fa','Farsi'),('he','Hebrew')]:
    try:
        x = get(f"https://raw.githubusercontent.com/christos-c/bible-corpus/master/bibles/{name}.xml")
        # Replace every XML tag with a space; the text between tags is kept as is.
        stripped = re.sub(r'<[^>]+>',' ',x)
        # Explicit UTF-8: this text is non-ASCII, so the locale's default
        # encoding must not decide whether the write succeeds.
        with open(f'{D}/typo_{tag}.txt', 'w', encoding='utf-8') as fh:
            fh.write(stripped)
    except Exception as e: print('  skip',tag,e,file=sys.stderr)
# Arabic: Quran JSON, dediacritized
try:
    import json
    q = json.loads(get("https://raw.githubusercontent.com/risan/quran-json/main/dist/quran.json"))
    # The code reads q as a list of chapters, each with a 'verses' list of
    # objects carrying a 'text' field.
    verses=[v['text'] for ch in q for v in ch['verses']]
    # Delete every character in the class below (kept verbatim; presumably the
    # Arabic diacritic / annotation marks -- confirm before editing it).
    body=re.sub(r'[ً-ْٰٖ-ٟۖ-ۭ]','',' '.join(verses))
    # Explicit UTF-8 so writing Arabic text does not depend on the locale.
    with open(f'{D}/typo_ar.txt', 'w', encoding='utf-8') as fh:
        fh.write(body)
except Exception as e: print('  skip ar',e,file=sys.stderr)
# also provide de_clean/it/es used by typology (optional; typology_test skips missing)
for tag,eid in [('de',22367),('it',54452),('es',2000)]:
    # German is saved as de_clean.txt; the others as <tag>.txt.
    out_path = f'{D}/{tag}_clean.txt' if tag=='de' else f'{D}/{tag}.txt'
    try:
        # Fetch before opening so a failed download leaves the file missing
        # rather than present-but-empty.
        text = gutenberg(eid)
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write(text)
    except Exception as e: print('  skip',tag,e,file=sys.stderr)
print('Text corpora done.', file=sys.stderr)
PY

# --- Linear A and Indus corpora (cloned) ---
echo "Linear A + Indus corpora ..." >&2
# Clone each repository unless its directory already exists. Cloning stays
# best-effort (a failure does not abort the script), but it is now reported
# instead of being hidden by `-q` plus `|| true`.
for repo in \
  https://github.com/mwenge/lineara.xyz \
  https://github.com/mayig/indus-valley-script-corpus; do
  dir=${repo##*/}  # git's default target directory: last path component of the URL
  if [[ ! -d "$dir" ]] && ! git clone --depth 1 -q "$repo"; then
    echo "  WARNING: could not clone $repo" >&2
  fi
done

echo ""
echo "Done. Verifying the files the scripts require exist:"
# Count required outputs that are absent or zero-length (-s is false for both).
missing=0
for f in ZL3b-n.txt GC2a-n.txt english.txt latin.txt fi_clean.txt genre_digby.txt; do
  if [[ -s "data/$f" ]]; then
    echo "  OK   data/$f"
  else
    echo "  MISSING data/$f"
    missing=$((missing + 1))
  fi
done
# Do not tell the user to run the analysis, or report success to a calling
# script, when a required input is absent.
if ((missing > 0)); then
  echo "ERROR: $missing required file(s) missing or empty; fix the fetch and re-run before running canonical.py." >&2
  exit 1
fi
echo "Now run:  python3 canonical.py"
