#!/bin/bash
# MIRROR FALLBACK for fetch_data.sh — for environments where voynich.nu,
# gutenberg.org and thelatinlibrary.com are unreachable (e.g. proxied CI).
# Produces the SAME processed filenames in ./data/ from public GitHub mirrors.
# fetch_data.sh remains the canonical provenance record; prefer it when the
# primary hosts are reachable. Differences vs primary sources are recorded in
# REPORT.md (reproduction part).
# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and on a failure in any stage of a pipeline (pipefail).
set -euo pipefail
# Work from the directory that holds this script so that ./data is created
# next to it regardless of the caller's cwd. CDPATH is blanked for this one
# command: when the script is invoked via a relative path (e.g.
# "scripts/x.sh"), an exported CDPATH would otherwise be searched first and
# could send us to a different directory. "--" guards against a leading "-".
CDPATH='' cd -- "$(dirname -- "$0")"
mkdir -p data
python3 - <<'PY'
import re, sys, subprocess, html as _h

def get(url):
    """Download ``url`` with curl and return the body as text.

    The body is decoded as UTF-8 with undecodable bytes replaced.

    Raises RuntimeError when curl exits non-zero, when the body is empty, or
    when the body looks like a short "Not Found" page.
    """
    # --fail makes curl exit non-zero on HTTP error statuses instead of
    # handing back the server's error page, which would otherwise be written
    # to disk as corpus data. -S keeps curl's error text available on stderr
    # even though -s silences the progress meter.
    r = subprocess.run(['curl','-sSL','--fail','--max-time','120',
                        '-A','Mozilla/5.0',url],
                       capture_output=True)
    if r.returncode != 0 or not r.stdout:
        detail = (r.stderr or b'').decode('utf-8','replace').strip()
        suffix = f' (curl exit {r.returncode}' + (f': {detail}' if detail else '') + ')'
        raise RuntimeError(f'fetch failed: {url}{suffix}')
    body = r.stdout.decode('utf-8','replace')
    # Backstop for hosts that answer 200 with a tiny "Not Found" body.
    if len(body) < 100 and 'Not Found' in body:
        raise RuntimeError(f'404: {url}')
    return body

def strip_pg(t):
    """Project Gutenberg header/footer stripper (same as fetch_data.sh).

    Returns the text between the end of the first line containing
    '*** START' and the first '*** END'. If either marker is absent the
    input is returned unchanged.
    """
    start = t.find('*** START')
    end = t.find('*** END')
    if start < 0 or end < 0:
        return t
    # Body begins on the line after the START marker line.
    body_from = t.find('\n', start) + 1
    return t[body_from:end]

def words_only(t):
    """Collapse every run of whitespace in ``t`` to a single space and trim
    leading/trailing whitespace."""
    pieces = re.split(r'\s+', t)
    return ' '.join(pieces).strip()

D='data'                                 # output directory, relative to the cwd
RAW='https://raw.githubusercontent.com'  # base URL shared by every mirror below

# --- Voynich ZL3b (EVA), pinned mirror of voynich.nu/data/ZL3b-n.txt v3b 13/05/2025 ---
# This fetch is not wrapped in try/except, so a failure propagates and ends
# the Python process with an error.
print('Voynich ZL3b (mirror) ...', file=sys.stderr)
# Download first and only then open the output file: opening with 'w'
# truncates, so a failed fetch must not be allowed to wipe an existing copy.
zl3b = get(
  f'{RAW}/Pantani/voynich/5df3bf95f9ca14b3ec711a86ef15f99e4c0e9a0c/voynich-codex-project/data/raw/ZL3b-n.txt')
# Explicit UTF-8 so the bytes on disk do not depend on the caller's locale.
with open(f'{D}/ZL3b-n.txt','w',encoding='utf-8') as fh:
    fh.write(zl3b)

# --- Latin control: Caesar, De Bello Gallico I-VI (CLTK Latin Library mirror) ---
# Best effort per book: a book that cannot be fetched is reported on stderr
# and left out; the remaining books are still concatenated and written.
print('Latin control (Caesar, cltk mirror) ...', file=sys.stderr)
raw=''
for i in range(1,7):
    try: raw += get(f'{RAW}/cltk/lat_text_latin_library/master/caesar/gall{i}.txt')+'\n'
    except Exception as e: print('  skip gall',i,e,file=sys.stderr)
# Drop bracketed section numbers and any other digit runs.
raw = re.sub(r'\[\d+\]|\d+','',raw)
# Drop a line-final "Caesar" and the two site-name strings.
raw = re.sub(r'Caesar\s*$|The Latin Library|The Classics Page','',raw,flags=re.M)
# Explicit UTF-8 (locale-independent output) and a context manager so the
# handle is flushed and closed deterministically.
with open(f'{D}/latin.txt','w',encoding='utf-8') as fh:
    fh.write(words_only(raw))

# --- English control: Alice + Dracula (GITenberg mirrors) ---
# Each book is fetched, stripped of the Gutenberg header/footer, whitespace-
# normalised, and appended. A book that fails is reported and skipped.
print('English control (GITenberg) ...', file=sys.stderr)
eng=''
for repo,eid in [('Alice-s-Adventures-in-Wonderland_11',11),('Dracula_345',345)]:
    try: eng += words_only(strip_pg(get(f'{RAW}/GITenberg/{repo}/master/{eid}.txt')))+' '
    except Exception as e: print('  skip',repo,e,file=sys.stderr)
# Explicit UTF-8 (locale-independent output) and a context manager so the
# handle is flushed and closed deterministically.
with open(f'{D}/english.txt','w',encoding='utf-8') as fh:
    fh.write(eng.strip())

# --- Recipe-genre control: Digby 1669 (GITenberg mirror) ---
print('Genre control (Digby, GITenberg) ...', file=sys.stderr)
try:
    # Fetch before opening the output: open(..., 'w') truncates, so doing it
    # first would leave an empty genre_digby.txt (or wipe a good one from an
    # earlier run) whenever the download is skipped.
    digby = strip_pg(get(
      f'{RAW}/GITenberg/The-Closet-of-Sir-Kenelm-Digby-Knight-Opened_16441/master/16441.txt'))
    # Explicit UTF-8 so the bytes on disk do not depend on the caller's locale.
    with open(f'{D}/genre_digby.txt','w',encoding='utf-8') as fh:
        fh.write(digby)
except Exception as e: print('  skip digby',e,file=sys.stderr)

# --- Finnish (Kalevala via GITenberg; fallback bible-corpus) ---
# Try the Kalevala first. If that fails, fall back to the bible-corpus Finnish
# XML with tags replaced by spaces. If both fail, fi stays empty and an empty
# file is written.
print('Finnish control ...', file=sys.stderr)
fi=''
try:
    fi = strip_pg(get(f'{RAW}/GITenberg/Kalevala_11940/master/11940.txt'))
except Exception:
    try:
        x = get(f'{RAW}/christos-c/bible-corpus/master/bibles/Finnish.xml')
        fi = re.sub(r'<[^>]+>',' ',x)
        print('  (substituted bible-corpus Finnish)', file=sys.stderr)
    except Exception as e: print('  skip fi',e,file=sys.stderr)
# Explicit UTF-8: the text is non-ASCII, so the locale default could fail or
# produce a different byte stream. The context manager closes the handle.
with open(f'{D}/fi_clean.txt','w',encoding='utf-8') as fh:
    fh.write(fi)

# --- bible-corpus XML (Turkish, Farsi, Hebrew, Hungarian, Greek, Czech, German) ---
# Each entry is (output filename, short tag used in the skip message,
# bible-corpus file stem). German goes to de_clean.txt; the rest to typo_<tag>.txt.
# A corpus that cannot be fetched or written is reported on stderr and skipped.
print('Typology corpora (bible-corpus) ...', file=sys.stderr)
for out,tag,name in [('typo_tr.txt','tr','Turkish'),('typo_fa.txt','fa','Farsi'),
                     ('typo_he.txt','he','Hebrew'),('typo_hu.txt','hu','Hungarian'),
                     ('typo_el.txt','el','Greek'),('typo_cs.txt','cs','Czech'),
                     ('de_clean.txt','de','German')]:
    try:
        x = get(f'{RAW}/christos-c/bible-corpus/master/bibles/{name}.xml')
        # Explicit UTF-8: these corpora are non-ASCII, so the locale default
        # could fail or produce a different byte stream.
        with open(f'{D}/{out}','w',encoding='utf-8') as fh:
            fh.write(re.sub(r'<[^>]+>',' ',x))  # replace every XML tag with a space
    except Exception as e: print('  skip',tag,e,file=sys.stderr)

print('Done.', file=sys.stderr)
PY

# Post-run report: for each listed output, print OK with its byte count when
# the file exists and is non-empty, otherwise print MISSING. Informational
# only; it does not change the exit status.
printf '\n%s\n' "Verifying files:"
for f in ZL3b-n.txt latin.txt english.txt fi_clean.txt typo_tr.txt; do
  if [[ ! -s "data/$f" ]]; then
    echo "  MISSING data/$f"
    continue
  fi
  echo "  OK   data/$f ($(wc -c < "data/$f") bytes)"
done
