######################################################################
##RORTY: Save this file as RORTY. To use it, stay in the             #
##same directory, get into Maple (by typing: maple <Enter> )         #
##and then type:  read RORTY : <Enter>                               #
##Then follow the instructions given there                           #
##                                                                   #
##Written by Doron Zeilberger, Temple University ,                   #
#zeilberg@math.temple.edu.                                           # 
#######################################################################
 
#Created: Sept. 22, 1998
#This version: Sept. 22, 1998
#RORTY: A Maple package to study texts
#given by their text-profiles
#Must have an accompanying Data file, TEXTP
#a sample may be downloaded from Zeilberger's site
#Please report bugs to zeilberg@math.temple.edu
#Load the text-profile data. The file TEXTP is read from the current
#directory, so it must be present there before this package is read.
read TEXTP:
 
 
#Start-up banner. The data-file message names TEXTP, the file actually
#read above.
print(`Created: Sept. 23, 1998.`):
print(`This version: Sept. 23, 1998`):
lprint(``):
print(`Written by Doron Zeilberger, zeilberg@math.temple.edu`):
lprint(``):
print(`Please report bugs to zeilberg@math.temple.edu`):
lprint(``):
print(`Must have an accompanying Data file, TEXTP, in the same directory`):
print(`a sample may be downloaded from Zeilberger's site`):
 print(`The most current version of this  package and paper`):
 print(` are  available from`):
 print(`http://www.math.temple.edu/~zeilberg/`):
 print(`For a list of the procedures type ezra(), for help with`):
 print(`a specific procedure, type ezra(procedure_name)`):
 print(``):
#ezra(): on-line help for the package.
#  ezra()      prints the list of documented procedures.
#  ezra(name)  prints a short description of the procedure called name.
#Any other call (an unknown name, or more than one argument) prints
#nothing. The procedure only prints; it returns NULL.
ezra:=proc()

#nargs=0 is the direct test for "called without arguments"; it does not
#depend on how a multi-argument sequence compares with NULL.
if nargs=0 then
 print(`Contains the following procedures: AvZipf, Corpus`):
  print(`Entropy, FavesWords, FreqRank, Kosher, Milim`):
 print(`, MilimAd, RatFreq, TavlaFreq, TavlaKama, TavlaRank`):
  print(`Texts, TopWords, Zinn, Zipf`):
 RETURN():
fi:

#Help is only defined for a single procedure name.
if nargs<>1 then
 RETURN():
fi:

#The names are mutually distinct, so at most one branch can fire.
if args[1]=Entropy then
 print(`Entropy(text1): The word-Entropy per-word of a text profile text1`):
 print(`i.e. the expected log of frequency per log(N)`):

elif args[1]=Corpus then
 #Corpus folds Khaber over its arguments, so it takes a sequence of
 #profiles and returns a merged list of [word,count] pairs.
 print(`Corpus(text1,text2,...): Given a sequence of text-profiles, combines`):
 print(`them into one Corpus-profile, i.e. a single merged list of`):
 print(`pairs [word,number-of-occurrences] `):

elif args[1]=RatFreq then
 print(`RatFreq(text1,text2): the list of ratios`):
 print(`[freq(u[i])/freq(v[i]), where u[i] ( v[i] )`):
 print(`are the i^th most frequent word in text1 (text2)`):

elif args[1]=FavesWords then
 #FavesWords returns a set, compares with a strict >, and also keeps
 #the words that do not occur in text2 at all.
 print(`FavesWords(text1,text2,Rat,L): The set of words in text1`):
 print(`whose frequency is more than Rat times that of the`):
 print(`frequency of that word in text2 (or that are absent from text2),`):
 print(`and that are among the top L frequent words of text1`):

elif args[1]=`AvZipf` then
 print(`AvZipf(text1,L): the average of i*freq[i] for the L`):
 print(`most frequent words`):

elif args[1]=`Zinn` then
 print(`Zinn(resh,n): Zinn's method `):
 print(`Given an increasing sequence a(n) of positive integers , expressed`):
 print(`in terms of a list, estimates the theta and mu such that `):
 print(` a(n) is asympt. to n^(theta)*mu^n. The output is `):
 print(` theta, mu `):
 print(` The entries n-2,...,n+1 of the list resh are used `):

elif args[1]=TavlaRank then
 print(`TavlaRank(text1): Given a text-profile, outputs`):
 print(`the table of ranking (by frequency)`):

elif args[1]=TavlaFreq then
 print(`TavlaFreq(text1): Given a text-profile, outputs`):
 print(`the table of frequencies`):

elif args[1]=TopWords then
 print(`TopWords(text1,K): the list of the K most frquent words`):
 print(`in text1, in decreasing order of frequency`):

elif args[1]=MilimAd then
 print(`MilimAd(text1,K): the set of words in the top-K list`):

elif args[1]=Zipf then
 print(`Zipf(text1): the lists of frequencies of words,`):
 print(`from most frequent to least frequent times its rank`):
 print(`should be roughly .1 by Zipf's law`):
 print(`given in floating point`):

elif args[1]=TavlaKama then
 print(`TavlaKama(text1): Given a text-profile, outputs`):
 print(`the table of number-of-occurrences`):

elif args[1]=Kosher then
 print(`Kosher(Text1): checks whether the text-profile is in`):
 print(`the right format`):

elif args[1]=FreqRank then
 print(`FreqRank(text1): the lists of frequencies of words,`):
 print(`from most frequent to least frequent`):

elif args[1]=Milim then
 print(`Milim(list): Given a list of pairs [word,number of occurences]`):
 print(`extracts the first components, i.e. outputs `):
 print(`the set of words in the text`):

elif args[1]=Texts then
 print(`Texts(): the set of available text-profiles, but with`):
 print(` lower-case rendition`):
fi:

end:
 
#Texts(): returns the set of text-profile names known to the package.
#The names are the ones listed below; they are presumably assigned in
#the data file TEXTP (not visible here) - verify against that file.
Texts:=proc()
local names:
names:={unabomber,starr,genesis,exodus,leviticus,numbers,deuternomy}:
names:
end:
 
 
#Milim(text1): the set of words of the text-profile text1, i.e. the set
#of first components of its entries.
#Input : text1, a list of entries accepted by Kosher.
#Output: a set (empty when text1 is empty).
#Raises `bad input` when Kosher(text1) is false.
#Results are cached (option remember).
Milim:=proc(text1)
local k:
option remember:
if not Kosher(text1) then
 ERROR(`bad input`):
fi:

#Collect the first component of every entry in one pass.
{seq(text1[k][1],k=1..nops(text1))}:
end:
 
 
#TavlaKama(text1): Given a text-profile, outputs
#the table of number-of-occurrences.
#Input : text1, a list of entries [word,count] accepted by Kosher.
#Output: a table T with T[word]=count for every entry of text1.
#        If a word occurs in several entries, the last entry wins.
#Raises `bad input` when Kosher(text1) is false.
TavlaKama:=proc(text1)
local gu,i:
if not Kosher(text1) then
 ERROR(`bad input`):
fi:

#Create the table explicitly: with an empty profile the loop below never
#runs, and without this line an unassigned local name would be returned
#instead of an (empty) table.
gu:=table():
for i from 1 to nops(text1) do
 gu[text1[i][1]]:=text1[i][2]:
od:
gu:
end:
 
 
#FreqRank(text1): the list of relative frequencies count/total, read
#from the LAST entry of text1 to the first.
#This is "most frequent first" provided text1 is listed in increasing
#order of count (the order Khaber produces); the order is not checked.
#Input : text1, a text-profile accepted by Kosher.
#Output: a list of exact ratios ([] for an empty profile).
#Raises `bad input` when Kosher(text1) is false.
FreqRank:=proc(text1)
local counts,total,n,r:
if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):
#counts[r] is the count stored in the r-th entry from the end.
counts:=[seq(text1[n+1-r][2],r=1..n)]:
total:=convert(counts,`+`):
#Divide every count by the total; total is passed as an extra argument.
map((c,t)->c/t,counts,total):
end:
 
 
#Kosher(Text1): checks whether the text-profile is in
#the right format, i.e. whether every entry of Text1 is a two-element
#list whose first entry is of type string and whose second entry is an
#integer.
#Output: true if all entries pass (in particular for an empty Text1);
#        otherwise prints a message about the first offending entry and
#        returns false.
Kosher:=proc(Text1)
local lu,i:
for i from 1 to nops(Text1) do
 lu:=op(i,Text1):

 if not (type(lu,list) and nops(lu)=2) then
  print(`The `, i,`th term`, lu, `does not have two elements`):
  RETURN(false):
 fi:

 if not type(lu[1],string) then
  print(`The `, i,`th term`, lu, `does not have its first entry a string`):
  RETURN(false):
 fi:

 #The second entry must be an integer; the message now says so
 #(it used to say "a string").
 if not type(lu[2],integer) then
  print(`The `, i,`th term`, lu, `does not have its second entry an integer`):
  RETURN(false):
 fi:
od:
true:
end:
 
 
 
#Zipf(text1): the list of values rank*frequency in floating point,
#where the entry of rank r is the r-th entry of text1 counted from the
#end and frequency is count/total.
#By Zipf's law these products should be roughly constant (the package
#help text quotes about .1). Rank 1 is the most frequent word provided
#text1 is listed in increasing order of count; this is not checked.
#Input : text1, a text-profile accepted by Kosher.
#Output: a list of floats ([] for an empty profile).
#Raises `bad input` when Kosher(text1) is false.
Zipf:=proc(text1)
local counts,total,n,r:
if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):
#counts[r] is the count stored in the r-th entry from the end.
counts:=[seq(text1[n+1-r][2],r=1..n)]:
total:=convert(counts,`+`):
evalf([seq(r*counts[r]/total,r=1..n)]):
end:
 
 
 
#MilimAd(text1,K): the set of words found in the last K entries of the
#text-profile text1 (all of them when K exceeds the number of entries).
#These are the K most frequent words provided text1 is listed in
#increasing order of count; the order is not checked.
#Input : text1, a text-profile accepted by Kosher; K, a number.
#Output: a set of words ({} when K<=0 or text1 is empty).
#Raises `bad input` when Kosher(text1) is false.
MilimAd:=proc(text1,K)
local n,k:
if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):
#k counts entries from the end; never go past the first entry.
{seq(text1[n-k+1][1],k=1..min(K,n))}:
end:
 
 
#TopWords(text1,K): the last K entries of the text-profile text1, listed
#from the last entry backwards. Each element of the result is a whole
#entry [word,count], not just the word.
#This is "the K most frequent words in decreasing order of frequency"
#provided text1 is listed in increasing order of count (not checked).
#Input : text1, a list of entries; K, the number of entries wanted.
#Output: a list of min(K,nops(text1)) entries ([] when K<=0).
TopWords:=proc(text1,K)
local i,n:
n:=nops(text1):
#Clamp K to the length of text1, as MilimAd does, so that asking for
#more words than exist returns them all instead of indexing position 0
#or below.
[seq(text1[n-i+1],i=1..min(K,n))]:
end:
 
 
#TavlaFreq(text1): Given a text-profile, outputs
#the table of frequencies.
#Input : text1, a list of entries [word,count] accepted by Kosher.
#Output: a table T with T[word]=count/total, where total is the sum of
#        all counts in text1 (exact ratios, not floats).
#Raises `bad input` when Kosher(text1) is false.
TavlaFreq:=proc(text1)
local su,gu,i:
if not Kosher(text1) then
 ERROR(`bad input`):
fi:

#Total number of occurrences; the order of summation is irrelevant, so
#no reversed copy of the counts is built.
su:=convert([seq(text1[i][2],i=1..nops(text1))],`+`):

#Create the table explicitly so that an empty profile yields an empty
#table rather than an unassigned local name.
gu:=table():
for i from 1 to nops(text1) do
 gu[text1[i][1]]:=text1[i][2]/su:
od:
gu:
end:
 
 
 
#TavlaRank(text1): Given a text-profile, outputs
#the table of ranking (by frequency).
#The word of the i-th entry gets rank nops(text1)-i+1, so the last
#entry has rank 1. This is the ranking by frequency provided text1 is
#listed in increasing order of count (not checked).
#Input : text1, a list of entries [word,count] accepted by Kosher.
#Output: a table T with T[word]=rank.
#Raises `bad input` when Kosher(text1) is false.
TavlaRank:=proc(text1)
local gu,i:
if not Kosher(text1) then
 ERROR(`bad input`):
fi:

#Create the table explicitly so that an empty profile yields an empty
#table rather than an unassigned local name.
gu:=table():
for i from 1 to nops(text1) do
 gu[text1[i][1]]:=nops(text1)-i+1:
od:
gu:
end:
 
 
 
#Corpus(text1,text2,...): Given a sequence of text-profiles (passed as
#separate arguments), combines them into one Corpus-profile by merging
#them from left to right with Khaber.
#Input : one or more text-profiles. The formal parameter TextList is
#        only a placeholder; the profiles are read through args.
#Output: with one argument, that argument unchanged; otherwise the
#        result of Khaber(...Khaber(Khaber(text1,text2),text3)...).
#Raises an error when called without arguments.
Corpus:=proc(TextList)
local i,gu:

#Fail with a clear message instead of an invalid-subscript error on
#args[1].
if nargs=0 then
 ERROR(`Corpus needs at least one text-profile`):
fi:

gu:=args[1]:

#With a single argument this loop does not run and args[1] is returned.
for i from 2 to nargs do 
 gu:=Khaber(gu,args[i]):
od:

gu:
end:
 
 
#Zinn(resh,n): Zinn's method. Returns the expression sequence theta, mu
#(two floats), the estimates for which the package help says resh[n] is
#asymptotic to n^theta*mu^n.
#Input : resh, a list of numbers; n, an index into resh.
#        The entries n-2,...,n+1 of resh are read (here and in sn).
#Uses the helper sn at the indices n and n-1.
Zinn:=proc(resh,n)
local sHi,sLo,tot,gap,theta,mu:
sHi:=sn(resh,n):
sLo:=sn(resh,n-1):

#Both estimates only involve the sum and the difference of the two
#sn values.
tot:=sHi+sLo:
gap:=sHi-sLo:

theta:=evalf(2*tot/gap^2):
mu:=evalf(sqrt(op(n+1,resh)/op(n-1,resh))*exp(-tot/(gap*sHi))):
theta,mu:
end:
 
 
#sn(resh,n): helper for Zinn. Returns, in floating point,
#   -1/log( resh[n+1]*resh[n-1]/resh[n]^2 ).
#Input : resh, a list of numbers; n, an index with entries n-1, n and
#        n+1 present in resh.
#The value is undefined (division by zero) when the ratio inside the
#log equals 1, e.g. for an exactly geometric sequence.
sn:=proc(resh,n)
#The expression is passed to evalf directly rather than through the
#ditto operator ", which later Maple releases read as a string delimiter.
evalf(-1/log(op(n+1,resh)*op(n-1,resh)/op(n,resh)^2)):
end:
 
 
 
#AvZipf(text1,L): the arithmetic mean of the first L values of
#Zipf(text1), i.e. of rank*frequency over the ranks 1..L.
#Input : text1, a text-profile; L, the number of leading values to
#        average (L must not exceed the length of Zipf(text1)).
#Output: a float.
AvZipf:=proc(text1,L)
local zipfVals,topSum:
zipfVals:=Zipf(text1):
#Sum of the first L values.
topSum:=convert([op(1..L,zipfVals)],`+`):
topSum/L:
end:
 
 
#FavesWords(text1,text2,Rat,L): the set of words w taken from
#MilimAd(text1,L) (the top-L words of text1) such that either
#  - w does not occur in text2 at all, or
#  - freq1(w)/freq2(w) > Rat, where freq1 and freq2 are the relative
#    frequencies of w in text1 and text2 (from TavlaFreq).
#Input : text1, text2 text-profiles; Rat a number; L a number of words.
#Output: a set of words.
FavesWords:=proc(text1,text2,Rat,L)
local top1,words2,freq1,freq2,picked,w:
top1:=MilimAd(text1,L):
words2:=Milim(text2):
freq1:=TavlaFreq(text1):
freq2:=TavlaFreq(text2):

picked:={}:
for w in top1 do
 if not member(w,words2) then
  #Absent from text2: always kept (freq2[w] would be undefined).
  picked:=picked union {w}:
 elif freq1[w]/freq2[w]>Rat then
  picked:=picked union {w}:
 fi:
od:

picked:
end:
 
 
 
#RatFreq(text1,text2): the list of ratios f1[k]/f2[k], where
#f1=FreqRank(text1) and f2=FreqRank(text2), for k from 1 up to the
#length of the shorter of the two lists.
#Input : text1, text2 text-profiles.
#Output: a list of exact ratios.
RatFreq:=proc(text1,text2)
local f1,f2,m,k:
f1:=FreqRank(text1):
f2:=FreqRank(text2):
#Only ranks present in both texts can be compared.
m:=min(nops(f1),nops(f2)):
[seq(f1[k]/f2[k],k=1..m)]:
end:
 
 
#Khaber(text1,text2): Given two text-profiles, text1, text2,
#finds the profile of the merged text.
#Input : text1, text2, text-profiles (validated inside Milim/TavlaKama).
#Output: a list of entries [word,count] where count is the sum of the
#        counts of word in the two profiles (a word present in only one
#        profile keeps its count). Entries are listed in increasing
#        order of count; words with equal counts appear in the order in
#        which Maple stores them in a set.
Khaber:=proc(text1,text2)
local T1,T2,S1,S2,S,w,u,c,T,TI,lu,nu,hu:
S1:=Milim(text1):
S2:=Milim(text2):
S:=S1 union S2:
T1:=TavlaKama(text1):
T2:=TavlaKama(text2):

#T[w]: merged number of occurrences of the word w.
for w in S do
 if member(w,S1) and member(w,S2) then
  T[w]:=T1[w]+T2[w]:
 elif member(w,S1) then
  T[w]:=T1[w]:
 else
  #w is in S but not in S1, hence it comes from text2.
  T[w]:=T2[w]:
 fi:
od:

#lu: the distinct merged counts, in increasing order.
lu:=sort(convert({seq(T[u],u=S)},list)):

#TI[count]: the set of words having that merged count.
for nu in lu do
 TI[nu]:={}:
od:
for w in S do
 TI[T[w]]:=TI[T[w]] union {w}:
od:

#Emit the words count by count, smallest count first.
hu:=[seq(seq([u,c],u=TI[c]),c=lu)]:
hu:

end:
 
 
#Entropy(text1): The word-Entropy per-word of a text profile text1,
#i.e. -sum(f*log(f)) over the relative frequencies f of its words,
#divided by log(N), where N is the number of entries of the profile.
#Input : text1, a text-profile (validated inside FreqRank).
#Output: a float. By convention 0 is returned for a profile with
#        fewer than two entries, where log(N) would be 0 or undefined.
Entropy:=proc(text1)
local gu,i,N,mu:
gu:=FreqRank(text1):
N:=nops(gu):

#log(1)=0 and log(0) is undefined, so the normalisation below cannot be
#carried out; a text with at most one distinct word has no uncertainty.
if N<=1 then
 RETURN(evalf(0)):
fi:

mu:=0:
for i from 1 to N do
 #A word with zero frequency contributes nothing (f*log(f) -> 0 as
 #f -> 0); skipping it avoids evaluating log(0).
 if gu[i]<>0 then
  mu:=mu-evalf(gu[i]*log(gu[i])):
 fi:
od:

evalf(mu/log(N)):
end: