#!/usr/bin/env python3 """Statistical analysis of the Voynich manuscript (ZL3b transliteration, IVTFF format) against natural-language controls (Latin, English) and a random-gibberish control. Pure stdlib. Outputs a readable report to stdout and JSON to results.json. """ import re, math, json, random, sys from collections import Counter, defaultdict DATA = "/Users/arcandledger/taxdome/ancient-texts/data" # ---------------- IVTFF parsing ---------------- def parse_ivtff(path): pages = {} # page -> dict of $vars lines = [] # records: dict(page, lang, illust, hand, locus_type, par_first, words, raw) cur_page, cur_vars = None, {} page_hdr = re.compile(r'^<(f[0-9a-zA-Z]+)>\s*(?:)?') locus_re = re.compile(r'^<(f[0-9a-zA-Z]+)\.([^,>]+),\s*([@+=*~$&!])(\w+?)(\d*)>\s*(.*)$') for raw in open(path, encoding='utf-8', errors='replace'): raw = raw.rstrip('\n') if not raw or raw.startswith('#'): continue m = page_hdr.match(raw) if m and '.' not in m.group(1) + '': # page header like if not locus_re.match(raw): cur_page = m.group(1) cur_vars = dict(re.findall(r'\$(\w)=(\w+)', raw)) pages[cur_page] = cur_vars continue m = locus_re.match(raw) if not m: continue page, locnum, gen, ltype, lsub, text = m.groups() v = pages.get(page, {}) par_first = '<%>' in text # clean text t = text t = re.sub(r'', '', t) # inline comments t = re.sub(r'<->', '?', t) # illegible chunk t = re.sub(r'<%>|<\$>|<@\w+>', '', t) # paragraph markers t = re.sub(r'@\d+;', '?', t) # rare-glyph codes -> placeholder # alternate readings [a:b:...] -> first option for _ in range(4): t = re.sub(r'\[([^:\[\]]*):[^\[\]]*\]', r'\1', t) t = t.replace(',', '.') # uncertain space -> space t = re.sub(r'[!%]', '', t) # padding words = [w for w in t.split('.') if w] lines.append(dict(page=page, lang=v.get('L'), illust=v.get('I'), hand=v.get('H'), ltype=ltype, par_first=par_first, words=words)) return pages, lines # ---------------- helpers ---------------- def shannon(counter): n = sum(counter.values()) return -sum(c/n * math.log2(c/n) for c in counter.values()) def cond_entropy_chars(text): """h1 and h2 (conditional bigram entropy) over the char stream incl. space.""" uni, bi = Counter(text), Counter(zip(text, text[1:])) h1 = shannon(uni) n2 = sum(bi.values()) # H(X2|X1) = H(X1,X2) - H(X1) h12 = -sum(c/n2 * math.log2(c/n2) for c in bi.values()) return h1, h12 - shannon(Counter(text[:-1])) def zipf_slope(freqs, maxrank=1000): fr = sorted(freqs, reverse=True)[:maxrank] pts = [(math.log(r+1), math.log(f)) for r, f in enumerate(fr)] n = len(pts); sx = sum(x for x,_ in pts); sy = sum(y for _,y in pts) sxx = sum(x*x for x,_ in pts); sxy = sum(x*y for x,y in pts) return (n*sxy - sx*sy) / (n*sxx - sx*sx) def levenshtein(a, b, cap=None): if a == b: return 0 la, lb = len(a), len(b) if cap is not None and abs(la-lb) > cap: return cap+1 prev = list(range(lb+1)) for i in range(1, la+1): cur = [i] + [0]*lb for j in range(1, lb+1): cur[j] = min(prev[j]+1, cur[j-1]+1, prev[j-1] + (a[i-1]!=b[j-1])) prev = cur return prev[lb] def adjacent_similarity(tokens, seed=42): """Mean normalized edit distance of consecutive word pairs vs shuffled baseline.""" pairs = [(a,b) for a,b in zip(tokens, tokens[1:])] def mean_nd(ps): s = n = 0 for a,b in ps: s += levenshtein(a,b)/max(len(a),len(b)); n += 1 return s/n real = mean_nd(pairs) sh = tokens[:]; random.Random(seed).shuffle(sh) base = mean_nd(list(zip(sh, sh[1:]))) ident = sum(1 for a,b in pairs if a==b)/len(pairs) near = sum(1 for a,b in pairs if a!=b and levenshtein(a,b,2)<=1)/len(pairs) return real, base, ident, near def neighbor_connectivity(vocab): """Fraction of word types with >=1 neighbor at edit distance 1 (deletion-neighborhood method).""" buckets = defaultdict(set) vs = set(vocab) for w in vs: buckets[w].add(w) for i in range(len(w)): buckets[w[:i]+w[i+1:]].add(w) connected = 0; total_nb = 0 for w in vs: cand = set() keys = [w] + [w[:i]+w[i+1:] for i in range(len(w))] for k in keys: cand |= buckets.get(k, set()) nb = sum(1 for c in cand if c != w and levenshtein(c, w, 1) == 1) if nb: connected += 1 total_nb += nb return connected/len(vs), total_nb/len(vs) def word_stats(tokens): lens = Counter(len(w) for w in tokens) n = len(tokens) mean = sum(l*c for l,c in lens.items())/n var = sum((l-mean)**2*c for l,c in lens.items())/n dist = {l: lens.get(l,0)/n for l in range(1, 13)} return mean, math.sqrt(var), dist def tokenize_control(path, limit_chars=None): t = open(path, encoding='utf-8').read().lower() if limit_chars: t = t[:limit_chars] return re.findall(r'[a-z]+', t) # ---------------- load corpora ---------------- print("Parsing IVTFF ...", file=sys.stderr) pages, lines = parse_ivtff(f"{DATA}/ZL3b-n.txt") def valid(w): return re.fullmatch(r'[a-z]+', w) is not None # running text only (paragraph loci), labels separately text_lines = [l for l in lines if l['ltype'].upper().startswith('P')] label_lines = [l for l in lines if l['ltype'].upper().startswith('L')] all_tokens = [w for l in text_lines for w in l['words'] if valid(w)] tok_A = [w for l in text_lines if l['lang']=='A' for w in l['words'] if valid(w)] tok_B = [w for l in text_lines if l['lang']=='B' for w in l['words'] if valid(w)] lab_tokens = [w for l in label_lines for w in l['words'] if valid(w)] lat = tokenize_control(f"{DATA}/latin.txt") eng = tokenize_control(f"{DATA}/english.txt") # gibberish control: random words, same alphabet size & word-length distribution as Voynich rng = random.Random(7) vw_lens = [len(w) for w in all_tokens] alpha = sorted(set(''.join(all_tokens))) gib = [''.join(rng.choice(alpha) for _ in range(L)) for L in vw_lens] corpora = {'Voynich': all_tokens, 'Voynich-A': tok_A, 'Voynich-B': tok_B, 'Latin': lat, 'English': eng, 'Random': gib} R = {} print("Computing global stats ...", file=sys.stderr) for name, toks in corpora.items(): c = Counter(toks) text = ' '.join(toks) h1, h2 = cond_entropy_chars(text) mean, sd, dist = word_stats(toks) R[name] = dict(tokens=len(toks), types=len(c), alphabet=len(set(text))-1, ttr=len(c)/len(toks), zipf=zipf_slope(list(c.values())), h1=h1, h2=h2, wlen_mean=mean, wlen_sd=sd, wlen_dist=dist, top10=c.most_common(10), top100_cov=sum(f for _,f in c.most_common(100))/len(toks)) print("Adjacent-word similarity ...", file=sys.stderr) for name in ['Voynich', 'Latin', 'English', 'Random']: toks = corpora[name][:40000] real, base, ident, near = adjacent_similarity(toks) R[name].update(adj_real=real, adj_base=base, adj_ident=ident, adj_near=near) print("Neighbor connectivity ...", file=sys.stderr) nv = len(Counter(all_tokens)) for name in ['Voynich', 'Latin', 'English', 'Random']: vocab = [w for w,_ in Counter(corpora[name]).most_common(nv)] conn, mean_nb = neighbor_connectivity(vocab) R[name].update(net_conn=conn, net_mean_nb=mean_nb, net_vocab=len(vocab)) # ---------------- A vs B comparison ---------------- cA, cB = Counter(tok_A), Counter(tok_B) topA = set(w for w,_ in cA.most_common(100)); topB = set(w for w,_ in cB.most_common(100)) jacc = len(topA & topB)/len(topA | topB) sig = {} for w in ['daiin','chol','chor','shol','qokeedy','qokedy','chedy','shedy','qokain','qokaiin','ol','aiin','dy']: sig[w] = (cA.get(w,0)/max(1,len(tok_A))*1000, cB.get(w,0)/max(1,len(tok_B))*1000) # ---------------- line-position effects ---------------- def line_effects(lns): first_last = dict(first_len=[], mid_len=[], last_len=[], last_m=0, last_n=0, other_m=0, other_n=0, pf_par=0, par_words=0, pf_oth=0, oth_words=0, gallows_first_par=0, par_lines=0, gallows_first_oth=0, oth_lines=0) for l in lns: ws = [w for w in l['words'] if valid(w)] if len(ws) < 3: continue first_last['first_len'].append(len(ws[0])) first_last['last_len'].append(len(ws[-1])) first_last['mid_len'] += [len(w) for w in ws[1:-1]] first_last['last_m'] += ws[-1].endswith('m'); first_last['last_n'] += 1 for w in ws[:-1]: first_last['other_m'] += w.endswith('m'); first_last['other_n'] += 1 # p/f concentration in paragraph-initial lines key = ('pf_par','par_words') if l['par_first'] else ('pf_oth','oth_words') first_last[key[0]] += sum(1 for w in ws if 'p' in w or 'f' in w) first_last[key[1]] += len(ws) gk = ('gallows_first_par','par_lines') if l['par_first'] else ('gallows_first_oth','oth_lines') first_last[gk[0]] += ws[0][0] in 'tkpf' first_last[gk[1]] += 1 return first_last fl = line_effects(text_lines) # ---------------- output ---------------- def fmt(x): return f"{x:.3f}" if isinstance(x, float) else str(x) print("\n=== GLOBAL STATISTICS " + "="*50) cols = ['tokens','types','alphabet','ttr','zipf','h1','h2','wlen_mean','wlen_sd','top100_cov'] print(f"{'corpus':<11}" + ''.join(f"{c:>11}" for c in cols)) for name in corpora: print(f"{name:<11}" + ''.join(f"{fmt(R[name][c]):>11}" for c in cols)) print("\nWord-length distribution (% of tokens):") print(f"{'len':<11}" + ''.join(f"{l:>7}" for l in range(1,13))) for name in ['Voynich','Latin','English','Random']: d = R[name]['wlen_dist'] print(f"{name:<11}" + ''.join(f"{100*d[l]:>7.1f}" for l in range(1,13))) print("\n=== ADJACENT-WORD SIMILARITY (lower = more similar neighbours) " + "="*8) print(f"{'corpus':<11}{'real':>9}{'shuffled':>10}{'ratio':>8}{'P(same)':>9}{'P(dist<=1)':>11}") for name in ['Voynich','Latin','English','Random']: r = R[name] print(f"{name:<11}{r['adj_real']:>9.3f}{r['adj_base']:>10.3f}{r['adj_real']/r['adj_base']:>8.3f}" f"{r['adj_ident']:>9.4f}{r['adj_near']:>11.4f}") print("\n=== VOCABULARY SIMILARITY NETWORK (types with an edit-distance-1 neighbour) ===") for name in ['Voynich','Latin','English','Random']: r = R[name] print(f"{name:<11} vocab={r['net_vocab']:>6} connected={100*r['net_conn']:>5.1f}% mean#neighbours={r['net_mean_nb']:.2f}") print("\n=== CURRIER A vs B " + "="*50) print(f"A: {len(tok_A)} tokens, {len(cA)} types | B: {len(tok_B)} tokens, {len(cB)} types") print(f"Jaccard overlap of top-100 word lists: {jacc:.3f}") print(f"{'word':<10}{'A /1000':>10}{'B /1000':>10}") for w,(a,b) in sig.items(): print(f"{w:<10}{a:>10.2f}{b:>10.2f}") print("\n=== LINE-POSITION EFFECTS (running text) " + "="*30) import statistics as st print(f"mean word length: line-first={st.mean(fl['first_len']):.2f} mid={st.mean(fl['mid_len']):.2f} line-last={st.mean(fl['last_len']):.2f}") print(f"P(word ends in 'm'): line-final={fl['last_m']/fl['last_n']:.3f} elsewhere={fl['other_m']/fl['other_n']:.3f}") print(f"P(word contains p/f): paragraph-initial lines={fl['pf_par']/fl['par_words']:.3f} other lines={fl['pf_oth']/fl['oth_words']:.3f}") print(f"P(line starts with gallows t/k/p/f): par-initial={fl['gallows_first_par']/fl['par_lines']:.3f} other={fl['gallows_first_oth']/fl['oth_lines']:.3f}") print(f"\nLabels: {len(lab_tokens)} tokens, {len(set(lab_tokens))} types") print("Top Voynich words:", R['Voynich']['top10']) print("Top A:", cA.most_common(8)) print("Top B:", cB.most_common(8)) json.dump({k: {kk: vv for kk, vv in v.items() if kk != 'wlen_dist'} for k, v in R.items()}, open('/Users/arcandledger/taxdome/ancient-texts/results.json', 'w'), indent=1, default=str) print("\nresults.json written")