######################################################################
##RORTY: Save this file as RORTY. To use it, stay in the            #
##same directory, get into Maple (by typing: maple )                #
##and then type: read RORTY :                                       #
##Then follow the instructions given there                          #
##                                                                  #
##Written by Doron Zeilberger, Temple University ,                  #
#zeilberg@math.temple.edu.                                          #
#######################################################################
#Created: Sept. 22, 1998
#This version: Sept. 22, 1998
#RORTY: A Maple package to study texts
#given by their text-profiles
#Must have an accompanying Data file, TEXTP
#a sample may be downloaded from Zeilberger's site
#Please report bugs to zeilberg@math.temple.edu

#A text-profile, as checked by Kosher below, is a list of pairs
#[word,number-of-occurrences]. FreqRank reads the list from its last
#entry to its first and calls that "most frequent to least frequent",
#i.e. a profile is expected to be sorted by increasing count.

#Load the data file (it must be in the current directory).
read TEXTP:

#Banner shown when the package is read in.
print(`Created: Sept. 23, 1998.`):
print(`This version: Sept. 23, 1998`):
lprint(``):
print(`Written by Doron Zeilberger, zeilberg@math.temple.edu`):
lprint(``):
print(`Please report bugs to zeilberg@math.temple.edu`):
lprint(``):
#The file that is actually read above is TEXTP, so the banner names it.
print(`Must have an accompanying Data file, TEXTP, in the same directory`):
print(`a sample may be downloaded from Zeilberger's site`):
print(`The most current version of this package and paper`):
print(` are available from`):
print(`http://www.math.temple.edu/~zeilberg/`):
print(`For a list of the procedures type ezra(), for help with`):
print(`a specific procedure, type ezra(procedure_name)`):
print(``):

#ezra(): prints the list of procedures of the package.
#ezra(procedure_name): prints the help text of that procedure.
#Any other call (unknown name, or more than one argument) prints nothing.
ezra:=proc() local topic:

#nargs=0 is used rather than args=NULL, since the latter is not a
#well-formed equation when ezra is called with two or more arguments.
if nargs=0 then
 print(`Contains the following procedures: AvZipf, Corpus`):
 print(`Entropy, FavesWords, FreqRank, Kosher, Milim`):
 print(`, MilimAd, RatFreq, TavlaFreq, TavlaKama, TavlaRank`):
 print(`Texts, TopWords, Zinn, Zipf`):
elif nargs=1 then
 topic:=args[1]:

 if topic=Entropy then
  print(`Entropy(text1): The word-Entropy per-word of a text profile text1`):
  print(`i.e. the expected log of frequency per log(N)`):
 elif topic=Corpus then
  print(`Corpus(TextList): Given a list of text-profiles, combines`):
  print(`them into one Corpus-profile, in terms of a table of `):
  print(`number-of-occurences, followed by the set of all words `):
 elif topic=RatFreq then
  print(`RatFreq(text1,text2): the list of ratios`):
  print(`[freq(u[i])/freq(v[i]), where u[i] ( v[i] )`):
  print(`are the i^th most frequent word in text1 (text2)`):
 elif topic=FavesWords then
  print(`FavesWords(text1,text2,Rat,L): The list of words in text1`):
  print(`whose frequency is at least R times that of the`):
  print(`frequency of that word in text2, and that is among the top L`):
  print(`frequent words of text1`):
 elif topic=`AvZipf` then
  print(`AvZipf(text1,L): the average of i*freq[i] for the L`):
  print(`most frequent words`):
 elif topic=`Zinn` then
  print(` Zinn's method `):
  print(`Given an increasing sequence a(n) of positive integers , expressed`):
  print(`in terms of a list, estimates the theta and mu such that `):
  print(` a(n) is asympt. to n^(theta)*mu^n. The output is `):
  print(` theta, mu `):
 elif topic=TavlaRank then
  print(`TavlaRank(text1): Given a text-profile, outputs`):
  print(`the table of ranking (by frequency)`):
 elif topic=TavlaFreq then
  print(`TavlaFreq(text1): Given a text-profile, outputs`):
  print(`the table of frequencies`):
 elif topic=TopWords then
  print(`TopWords(text1,K): the list of the K most frquent words`):
  print(`in text1, in decreasing order of frequency`):
 elif topic=MilimAd then
  print(`MilimAd(text1,K): the set of words in the top-K list`):
 elif topic=Zipf then
  print(`Zipf(text1): the lists of frequencies of words,`):
  print(`from most frequent to least frequent times its rank`):
  print(`should be roughly .1 by Zipf's law`):
  print(`given in floating point`):
 elif topic=TavlaKama then
  print(`TavlaKama(text1): Given a text-profile, outputs`):
  print(`the table of number-of-occurrences`):
 elif topic=Kosher then
  print(`Kosher(Text1): checks whether the text-profile is in`):
  print(`the right format`):
 elif topic=FreqRank then
  print(`FreqRank(text1): the lists of frequencies of words,`):
  print(`from most frequent to least frequent`):
 elif topic=Milim then
  print(`Milim(list): Given a list of pairs [word,number of occurences]`):
  print(`extracts the first components, i.e. outputs `):
  print(`the set of words in the text`):
 elif topic=Texts then
  print(`Texts(): the set of available text-profiles, but with`):
  print(` lower-case rendition`):
 fi:
fi:

end:

#Texts(): the set of names of the available text-profiles.
#NOTE(review): the help text says these are lower-case renditions, so the
#profiles themselves are presumably defined under other names in TEXTP;
#the spelling deuternomy is kept exactly as in the original - confirm
#against TEXTP before changing it.
Texts:=proc()
{unabomber,starr,genesis,exodus,leviticus,numbers,deuternomy}:
end:

#Milim(text1): Given a text-profile (a list of pairs
#[word,number of occurrences]), outputs the set of its words.
#Raises an error if text1 fails the Kosher check.
#The result is cached (option remember).
Milim:=proc(text1) local i:
option remember:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

{seq(text1[i][1],i=1..nops(text1))}:

end:

#TavlaKama(text1): Given a text-profile, outputs
#the table of number-of-occurrences (indexed by word).
#Raises an error if text1 fails the Kosher check.
TavlaKama:=proc(text1) local gu,i:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

#Create the table explicitly so that an empty profile still yields
#a (empty) table rather than an unassigned name.
gu:=table():

for i from 1 to nops(text1) do
 gu[text1[i][1]]:=text1[i][2]:
od:

gu:

end:

#FreqRank(text1): the lists of frequencies of words,
#from most frequent to least frequent.
#The counts are read from the last entry of text1 to the first and
#each one is divided by the total count (exact rational arithmetic).
#Raises an error if text1 fails the Kosher check.
FreqRank:=proc(text1) local gu,i,su,n:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):
gu:=[seq(text1[n-i+1][2],i=1..n)]:
su:=convert(gu,`+`):

[seq(gu[i]/su,i=1..n)]:

end:

#Kosher(Text1): checks whether the text-profile is in
#the right format, i.e. every entry is a list of two elements, the
#first of type string and the second of type integer.
#Returns true if so; otherwise prints a message describing the first
#offending entry and returns false.
Kosher:=proc(Text1) local lu,i:

for i from 1 to nops(Text1) do
 lu:=op(i,Text1):

 if not (type(lu,list) and nops(lu)=2) then
  print(`The `, i,`th term`, lu, `does not have two elements`):
  RETURN(false):
 fi:

 if not type(lu[1],string) then
  print(`The `, i,`th term`, lu, `does not have its first entry a string`):
  RETURN(false):
 fi:

 if not type(lu[2],integer) then
  #The check is for an integer, so the message says so.
  print(`The `, i,`th term`, lu, `does not have its second entry an integer`):
  RETURN(false):
 fi:
od:

true:

end:

#Zipf(text1): the lists of frequencies of words,
#from most frequent to least frequent times its rank
#should be roughly constant by Zipf's law
#given in floating point.
#The output should be roughly .1
#(the rank times the frequency; the total count is the sum of all counts).
#Raises an error if text1 fails the Kosher check.
Zipf:=proc(text1) local gu,i,su,n:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):
#counts, read from the last entry of the profile to the first
gu:=[seq(text1[n-i+1][2],i=1..n)]:
su:=convert(gu,`+`):

evalf([seq(i*gu[i]/su,i=1..n)]):

end:

#MilimAd(text1,K): the set of words in the top-K list,
#i.e. the words of the last K entries of text1 (all of them if
#text1 has fewer than K entries).
#Raises an error if text1 fails the Kosher check.
MilimAd:=proc(text1,K) local i,n:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):

{seq(text1[i][1],i=max(1,n-K+1)..n)}:

end:

#TopWords(text1,K): the list of the K most frquent words
#in text1, in decreasing order of frequency.
#Each entry of the output is the full pair [word,number of occurrences],
#taken from the end of text1 backwards. If text1 has fewer than K
#entries, all of them are returned (instead of indexing out of range).
#Raises an error if text1 fails the Kosher check, like the other
#procedures of the package.
TopWords:=proc(text1,K) local i,n,m:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):
m:=min(K,n):

[seq(text1[n-i+1],i=1..m)]:

end:

#TavlaFreq(text1): Given a text-profile, outputs
#the table of frequencies (indexed by word), each frequency being
#the count of the word divided by the sum of all the counts.
#Raises an error if text1 fails the Kosher check.
TavlaFreq:=proc(text1) local su,gu,i:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

su:=convert([seq(text1[i][2],i=1..nops(text1))],`+`):

#Create the table explicitly so that an empty profile still yields
#a (empty) table rather than an unassigned name.
gu:=table():

for i from 1 to nops(text1) do
 gu[text1[i][1]]:=text1[i][2]/su:
od:

gu:

end:

#TavlaRank(text1): Given a text-profile, outputs
#the table of ranking (by frequency), indexed by word: the last entry
#of text1 gets rank 1, the one before it rank 2, and so on.
#Raises an error if text1 fails the Kosher check.
TavlaRank:=proc(text1) local gu,i,n:

if not Kosher(text1) then
 ERROR(`bad input`):
fi:

n:=nops(text1):
gu:=table():

for i from 1 to n do
 gu[text1[i][1]]:=n-i+1:
od:

gu:

end:

#Corpus(text1,text2,...): Given a sequence of text-profiles, combines
#them into one Corpus-profile by merging them from left to right with
#Khaber; the output is whatever Khaber returns for the last merge.
#With a single argument, that argument is returned unchanged.
#The text-profiles are passed as separate arguments (they are read
#from args); the formal parameter TextList is kept for compatibility.
Corpus:=proc(TextList) local i,gu:

if nargs=0 then
 ERROR(`Corpus needs at least one text-profile`):
fi:

gu:=args[1]:

for i from 2 to nargs do
 gu:=Khaber(gu,args[i]):
od:

gu:

end:

#Zinn(resh,n): Zinn's method.
#Given a sequence a(n), expressed in terms of a list resh, estimates
#the theta and mu such that a(n) is asympt. to n^(theta)*mu^n,
#using the entries resh[n-2],...,resh[n+1].
#The output is theta, mu (in floating point).
#n must satisfy 3 <= n <= nops(resh)-1, so that all four entries exist.
Zinn:=proc(resh,n) local s1,s2:

if n<3 or n+1>nops(resh) then
 ERROR(`n must satisfy 3 <= n <= nops(resh)-1`):
fi:

s1:=sn(resh,n):
s2:=sn(resh,n-1):

evalf(2*(s1+s2)/(s1-s2)^2),
evalf(sqrt(op(n+1,resh)/op(n-1,resh))*exp(-(s1+s2)/((s1-s2)*s1))):

end:

#sn(resh,n): helper for Zinn; the floating-point value of
#-1/log(resh[n+1]*resh[n-1]/resh[n]^2).
#The value is passed to evalf directly instead of through the ditto
#operator ", which later Maple releases no longer accept.
sn:=proc(resh,n)

evalf(-1/log(op(n+1,resh)*op(n-1,resh)/op(n,resh)^2)):

end:

#AvZipf(text1,L): the average of i*freq[i] for the L
#most frequent words, i.e. of the first L entries of Zipf(text1).
#L must be between 1 and the number of entries of text1.
AvZipf:=proc(text1,L) local gu:

gu:=[op(1..L,Zipf(text1))]:

convert(gu,`+`)/L:

end:

#FavesWords(text1,text2,Rat,L): The list of
#words in text1
#whose frequency is at least R times that of the
#frequency of that word in text2, and that is among the top L
#frequent words of text1
#Notes on what the code does: R is the parameter Rat; the output is
#a set; the comparison is strict (freq1 > Rat*freq2); and a word of
#the top-L list that does not occur in text2 at all is always included.
FavesWords:=proc(text1,text2,Rat,L) local T1,T2,S1,S2,gu,i,w:

S1:=MilimAd(text1,L):
S2:=Milim(text2):
T1:=TavlaFreq(text1):
T2:=TavlaFreq(text2):

gu:={}:

for i from 1 to nops(S1) do
 w:=op(i,S1):

 if not member(w,S2) then
  gu:=gu union {w}:
 #Cross-multiplied form of T1[w]/T2[w]>Rat: same answer for positive
 #frequencies, and no division by zero when w has count 0 in text2.
 elif T1[w]>Rat*T2[w] then
  gu:=gu union {w}:
 fi:
od:

gu:

end:

#RatFreq(text1,text2): the list of ratios
#[freq(u[i])/freq(v[i]), where u[i] ( v[i] )
#are the i^th most frequent word in text1 (text2)
#The list stops at the length of the shorter of the two profiles.
RatFreq:=proc(text1,text2) local gu1,gu2,i,m:

gu1:=FreqRank(text1):
gu2:=FreqRank(text2):
m:=min(nops(gu1),nops(gu2)):

[seq(gu1[i]/gu2[i],i=1..m)]:

end:

#Khaber(text1,text2): Given two text-profiles, text1, text2
#finds the profile of the merged text: the list of pairs
#[word,number of occurrences], where the counts of words that occur in
#both texts are added, listed by increasing number of occurrences
#(words with equal counts appear in the order Maple stores the set
#of such words).
Khaber:=proc(text1,text2) local T1,T2,S1,S2,S,T,TI,lu,i,j,w,c,v:

S1:=Milim(text1):
S2:=Milim(text2):
S:=S1 union S2:

T1:=TavlaKama(text1):
T2:=TavlaKama(text2):

#T[w]: number of occurrences of the word w in the merged text
T:=table():

for i from 1 to nops(S) do
 w:=op(i,S):

 if member(w,S1) and member(w,S2) then
  T[w]:=T1[w]+T2[w]:
 elif member(w,S1) then
  T[w]:=T1[w]:
 else
  T[w]:=T2[w]:
 fi:
od:

#lu: the distinct counts, in increasing order
lu:=sort([op({seq(T[op(j,S)],j=1..nops(S))})]):

#TI[c]: the set of words that occur exactly c times
TI:=table():

for i from 1 to nops(lu) do
 TI[op(i,lu)]:={}:
od:

for i from 1 to nops(S) do
 w:=op(i,S):
 TI[T[w]]:=TI[T[w]] union {w}:
od:

#For every count in increasing order, list its words.
[seq(seq([v,c],v=TI[c]),c=lu)]:

end:

#Entropy(text1): The word-Entropy per-word of a text profile text1
#i.e. the expected log of frequency per log(N)
#where N is the number of entries of the profile:
#-sum(f[i]*log(f[i]),i=1..N)/log(N), in floating point.
#Words of frequency 0 contribute nothing to the sum, and for a profile
#with fewer than two entries (where log(N) is 0 or undefined) the
#output is 0.
Entropy:=proc(text1) local gu,i,N,mu:

gu:=FreqRank(text1):
N:=nops(gu):

if N<=1 then
 RETURN(0.):
fi:

mu:=0:

for i from 1 to N do
 #skip zero frequencies: log(0) is undefined, and the term is 0 by convention
 if gu[i]>0 then
  mu:=mu-evalf(gu[i]*log(gu[i])):
 fi:
od:

evalf(mu/log(N)):

end: