# Final Project -- Spring 2024 # Statistics of English Language # Members: Pablo Blanco, Aurora Hiveley, Kaylee Weatherspoon # Due date: May 1, 2024 at 11 PM read(`ENGLISH.txt`): # this file should contain all english words as a list of lists randomize(): Help:= proc(): if args=NULL then print(`The supported procedures are: generateWord, generateWordRandom, `): print(`generateWordRandWted, generateWordSt,MakeTrM, Swap, DevTools`): print(`Try, for example: Help(MakeTrM)`): else: HelpHelper(args): fi: end: HelpHelper:=proc(): if nargs<>1 then: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Too many arguments!`): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): elif args[1]=generateWord then print(`******************************************************************************************`): print(`generateWord(K,lt): generates a word of length K using the transition matrix MakeTrM(K),`): print(`which begins with letter lt. It generates the word deterministically letter-by-letter by`): print(`choosing the most likely subsequent letter. Example:`): print(`generateWord(7,o); `): print(`will return [o, n, g, e, r, e, r]. If generateWord generates a word which is already in`): print(`ENGLISH.txt, then it will return FAIL instead.`): print(`******************************************************************************************`): elif args[1]=generateWordRandom then print(`******************************************************************************************`): print(`generateWordRandom(K,lt,T): similar to generateWord, but instead of picking the most common`): print(`successor, it selects from the T most common successors uniformly at random. For example, `): print(`generateWordRandom(6,c,5);`): print(`MAY sometimes return [c, e, n, s, e, n]. Upon generating a word already in ENGLISH.txt, the`): print(`procedure returns FAIL instead.`): print(`******************************************************************************************`): elif args[1]=generateWordRandWted then print(`******************************************************************************************`): print(`generateWordRandWted(K,lt,T): similar to generateWordRandom, but the T most common successors`): print(`are chosen with weights corresponding to their transition probability. For example,`): print(`generateWordRandWted(5,h,20);`): print(`MAY sometimes return [h, o, c, h, a]. Upon generating a word already in ENGLISH.txt, the`): print(`procedure returns FAIL instead.`): print(`******************************************************************************************`): elif args[1]=generateWordSt then print(`******************************************************************************************`): print(`generateWordSt(K): generate a word letter-by-letter, using the transition table generated`): print(`by MakeTrM(K,K). Try: generateWordSt(2).`): print(`******************************************************************************************`): elif args[1]=MakeTrM then print(`******************************************************************************************`): print(`MakeTrM(K): returns a 28x28 transition probability matrix by using data in ENGLISH.txt.`): print(`In particular, the procedure only considers words of length at least K.`): print(`Returns this matrix as a list of lists, where the n-th row corresponds to the n-th letter`): print(`of the alphabet. The first and last rows correspond to the \"start\" and \"end\" states,`): print(`respectively. For example, try: `): print(`MakeTrM(3)[2,3]`): print(`to obtain the transition probability from \"a\" to \"b\" in words of length at least 3.`): print(`******************************************************************************************`): print(`MakeTrM(K,N): returns a table which encodes transition probabilities from an N-letter`): print(`state to another letter. User must input K>=N. For N = 1, the output will be the same`): print(`as MakeTrM(K). For N >= 2, the output will be a table T with indices of the form [a1,...,aN],j`): print(`where a1,..,aN,j are the alphabetic indices of the corresponding letters. For example,`): print(`T:=MakeTrM(3,2):`): print(`T[[8,9],16];`): print(`the second command will return the probability that given the sequence [h,i], that`): print(`the next letter will be \"p\". Note that [h,i,p] has alphabetic indices [8,9,16].`): print(` In this example, T has 2-letter states and takes as input words of length at least 3.`): print(`******************************************************************************************`): elif args[1]=Swap then print(`******************************************************************************************`): print(`Swap(LETTER): given a letter, given as a string, tells you its index in the alphabet. Try`): print(`Swap("h")`): print(`to see that h is the 8th letter of the alphabet.`): print(`******************************************************************************************`): elif args[1]=DevTools then print(`******************************************************************************************`): print(`DevTools(): the Help command but for developer procedures. Try it!`): print(`******************************************************************************************`): else: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Help() does not support the following procedure:`): print(args): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): fi: end: # This is like Help() but for procedures only intended for development. DevTools:=proc(): if args=NULL then print(`The supported procedures are: alphaToNum,ConstrSeq, ConvertInd, ModifMtx,weightedRand,weightedRand2`): else: DevTools1(args): fi: end: DevTools1:=proc(): if nargs<>1 then: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Too many arguments!`): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): elif args[1]=alphaToNum then print(`******************************************************************************************`): print(`alphaToNum(W): given a word W as a sequence of letters, translate its alphabetic index. Try:`): print(`alphaToNum([c,a,t]);`): print(`which returns [3,1,20] and note that c,a,t have alphabetic indices 3,1,20, respectively.`): print(`******************************************************************************************`): elif args[1]=numToAlpha then print(`******************************************************************************************`): print(`numToAlpha(W): the inverse of alphaToNum.`): print(`******************************************************************************************`): elif args[1]=ConstrSeq then print(`******************************************************************************************`): print(`ConstrSeq(n,m): returns a set of all m-tuples such that all coordinates are in {1,..,n}`): print(`with the first coordinate also allowed to be 0.`): print(`******************************************************************************************`): elif args[1]=weightedRand then print(`******************************************************************************************`): print(`weightedRand(L): given a list L of probabilities, expressed as rational numbers, returns`): print(`said entry with the given probability. For example,`): print(`weightedRand([1/6,2/6,3/6])`): print(`will return 1/6 with probability 1/6, 2/6 with probability 2/6, and so on.`): print(`******************************************************************************************`): elif args[1]=weightedRand2 then print(`weightedRand2(L): given a list L of probabilities, expressed as rational numbers, returns`): print(`an index P with probability L[P]. For example,`): print(`weightedRand([3/6,2/6,1/6])`): print(`will return 1 with probability 3/6, 2 with probability 2/6, and so on.`): elif args[1]=ConvertInd then print(`******************************************************************************************`): print(`ConvertInd(M): Given a list of lists, converts its elements (words) into numerical`): print(`sequences corresponding to their alphabetically-indexed counterpart. Example:`): print(`ConvertInd([[t,a,c,o],[c,a,t]])`): print(`returns [[20, 1, 3, 15], [3, 1, 20]] because \"t\" is the 20th letter of the alphabet and`): print(`so on...`): print(`******************************************************************************************`): elif args[1]=ModifMtx then print(`******************************************************************************************`): print(`ModifMtx(k,L): Given a positive integer k and a list of matrices (lists of lists) L,`): print(`this procedure returns a list of the ROWS of length at least k. Example:`): print(`ModifMtx(2,[ [[1]], [[2,2],[3,3]] ])`): print(`will return [[2,2],[3,3]]`): print(`******************************************************************************************`): else: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Help() does not support the following procedure:`): print(args): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): fi: end: # Note: the given english file contains some sketchy two letter words # For instance "[c,l]". I would like to ignore these. # MakeTrM(k1): constructs a transition matrix using all english words of length >= k. State size = 1 # Returns a list of lists with the first column/row denoting "start" and the last row (28) denoting "end" # MakeTrM(k1,2): returns a table T instead. T[["start","a"]] will return the probability that a word begins with "a" # # MakeTrM(k1, stSize): # MakeTrM:=proc(k1,stSize:=1) local E,F,n,m,i,j,b1,b2,Counts,colTotal,T,L,Tbl,States,st,W,Wpr: option remember: # Only consider words of length >=k if k1 = 0 or k1 = 1 then E:=[seq(op(L),L in ENG())]: else: E:=ModifMtx(k1, ENG()): #E:=TruncMtx(k1, [seq(op(s),s in ENG())]): fi: E:=ConvertInd(E): # convert all words into number lists. this is done to preserve data and for convenience. n:=nops(E): # number of words ### if stSize=1 then: Counts:=[[0$28]$28]: # number of transitions colTotal:=[[0]$28]: # number of appearances colTotal[1][1]:=n: # every word has a start and an end colTotal[28][1]:=n: for i from 1 to n do: Counts[1][E[i][1]+1]:=Counts[1][E[i][1]+1]+1: # the first row is dedicated to "start" m:=nops(E[i]): # word length for j from 1 to m-1 do: colTotal[E[i][j]+1][1]:= colTotal[E[i][j]+1][1]+1: # letter E[i][j]+1 has appeared! note: "a" is colmn 2 but 1 in E. # terrible to read, I know. update transition count for j to j+1th letter in the ith word: Counts[E[i][j]+1][E[i][j+1]+1]:=Counts[E[i][j]+1][E[i][j+1]+1] + 1: od: Counts[E[i][m]+1][28]:=Counts[E[i][m]+1][28]+1: # update "end" count colTotal[E[i][m]+1][1]:= colTotal[E[i][m]+1][1]+1: od: T:= [seq([seq(Counts[j][i]/max(colTotal[j][1],1),i=1..28)],j=1..28)]: # transition matrix! return(T): fi: ## if k1 < stSize then: print(`state size argument must be smaller than minimum word length!`): return(FAIL): fi: # for now, use numbers as table indices. Tbl:=InitMtx(26,stSize): States:=ConstrSeq(26,stSize): for st in States do: Counts[st]:=0: od: #Counts[0]:= n: # start has exactly this many transitions total. # Count all transitions: for W in E do: Wpr:= [0,op(W), 27]: # this lines up better with our table indices b1:=1: b2:=b1+stSize-1: while Wpr[b2]<>27 do: Tbl[ [op(b1..b2,Wpr)], Wpr[b2+1]]:= Tbl[ [op(b1..b2,Wpr)], Wpr[b2+1]] + 1: Counts[[op(b1..b2,Wpr)]]:=Counts[[op(b1..b2,Wpr)]]+1: b1:=b1+1: b2:=b1+stSize-1: od: od: for st in States do: for i from 0 to 27 do: Tbl[st,i] := Tbl[st,i]/max(Counts[st],1): od: od: return(op(Tbl)): end: # given a list of lists, removes all lists with fewer than k elements. TruncMtx := proc(k,M) local n,S,s,i: n:=nops(M): S:=[]: for i from 1 to n do: if nops(M[i])>=k then: S:=[op(S),i]: fi: od: [seq(M[s],s in S)]: end: # given a list of square matrices, consider only matrices with row length at least k. # return a list of the rows of such matrices. ModifMtx := proc(k,M) local m,S: S:=[]: for m in M do: if nops(m)>=1 and nops(m[1])>=k then: S:=[op(S), op(m)]: fi: od: S: end: # converts the letters in (a list of lists) L into their alphabet index. ConvertInd := proc(L) local alpha,R,i1,j1: alpha:=table([a=1,b=2,c=3,d=4,e=5,f=6,g=7,h=8,i=9,j=10,k=11,l=12,m=13,n=14,o=15,p=16,q=17,r=18,s=19,t=20,u=21,v=22,w=23,x=24,y=25,z=26]): R:=[seq([seq( alpha[L[i1][j1]], j1=1..nops(L[i1]))],i1=1..nops(L))]: end: # changes ones letter STRING to its alphabetic index. Do NOT put into ConvertInd. It breaks it? not sure if it still does. # example: Swap("c") returns 3. Swap:=proc(x1) local alpha: alpha:=table(["a"=1,"b"=2,"c"=3,"d"=4,"e"=5,"f"=6,"g"=7,"h"=8,"i"=9,"j"=10,"k"=11,"l"=12,"m"=13,"n"=14,"o"=15,"p"=16,"q"=17,"r"=18,"s"=19,"t"=20,"u"=21,"v"=22,"w"=23,"x"=24,"y"=25,"z"=26]): alpha[x1]: end: # initializes a transition matrix/table for an alphabet of length n and m-letter states. 0 always denotes "start" and n+1 denotes "end" InitMtx:=proc(n,m) local T,S,s,t: T:=table([]): S:=ConstrSeq(n,m): for s in S do: for t from 0 to n+1 do: T[s,t] := 0: od: od: op(T): end: # construct and return the set of all m-tuples of the form # [a_1,...,a_m] # where a_1 \in {0,...,n} # a_2,..,a_{m} \in {1,...,n} ConstrSeq := proc(n,m) local S,N,N1,N2,i,s: N:={seq(i, i=1..n)}: # our alphabet, excluding start/end states if m<1 then: print(`what are you even trying to do?`): return(FAIL): fi: if m = 1 then: return({seq([i], i=0..n)}): fi: # we may assume m>= 2 N1:= {0} union N: S:=SeqHelper(m-1,N): S:= {seq( seq([i, op(s)] ,s in S), i in N1)}: # handle the first coordinate end: # returns the set of all m-tuples where each coordinate lives in some set S. SeqHelper := proc(m,S) local s,t,T: if m = 0 then: return({[]}): fi: if m = 1 then: return({seq([s], s in S)}): fi: T:=SeqHelper(m-1,S): return({seq(seq([s,op(t)], t in T), s in S)}): # I really dislike using recursion, but I think my hand is forced. - Pablo end: ############################################ generate word functions: # generate a highly probable english word # of length k starting with letter a generateWord := proc(K,A) local M,W,current,beta,xi: M:= MakeTrM(K,1): current := op(alphaToNum([A])): W := [current]: # build word for beta from 1 to K-1 do # find index of most common letter after current letter # note we *could* change this to second or third most common letter to get more words # we check only rows 2-27 so we ignore start and end rows member(max(M[current + 1][2..27]),M[current + 1],'xi'): W := [op(W), xi-1]: current := xi-1: od: # convert from number indices back to letters # and check if word is an english word if not member(numToAlpha(W), ENG()[K]) then numToAlpha(W); else fail: fi: end: ### conversion helpers alpha := [a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z]: alphaToNum := proc(W) local word,eta,xi: word := []: for eta from 1 to nops(W) do member(W[eta],alpha,'xi'): word := [op(word),xi]: od: word: end: numToAlpha := proc(W) local word,eta: word := []: for eta from 1 to nops(W) do word := [op(word), alpha[W[eta]]]: od: word: end: ### some example calls and results # generateWord(5,c); ## [c, o, n, g, e] # generateWord(7,o); ## [o, n, g, e, r, e, r] # generateWord(4,q); ## [q, u, n, g] ### 04/15/2024 # modification of generateWord # now randomly picks one of the top R most common successor letters # at each step of word construction generateWordRandom := proc(K,A,R) local M,W,current,beta,xi,ra,topR,successor: M:= MakeTrM(K,1): current := op(alphaToNum([A])): W := [current]: # build word ra := rand(1..R): for beta from 1 to K-1 do # find index of most common letter after current letter # note we *could* change this to second or third most common letter to get more words # we check only rows 2-27 so we ignore start and end rows topR := sort(M[current + 1][2..27])[-R..-1]: successor := topR[ra()]: member(successor,M[current + 1],'xi'): W := [op(W), xi-1]: current := xi-1: od: # convert from number indices back to letters # and check if word is an english word if not member(numToAlpha(W), ENG()[K]) then numToAlpha(W); else fail: fi: end: # inputs list of rational numbers # randomly selects a list element based on weighting using size of rationals # for example, the list [1/4,3/4] should select "2" 75% of the time # note that the rationals need not have the same denominator and # need not sum to 1 weightedRand := proc(L) local commonDen,markers,LL,R,eta,xi: commonDen := 1: for eta from 1 to nops(L) do commonDen := commonDen * denom(L[eta]): od: LL := commonDen*L: # list of whole numbers markers := [seq( sum( LL[xi], xi=1..eta) ,eta=1..nops(LL))]: R := rand(1..markers[-1])(): for eta from 1 to nops(L) do if R <= markers[eta] then RETURN(L[eta]): fi: od: fail: end: weightedRand2 := proc(L) local commonDen,markers,LL,R,eta,xi: commonDen := 1: for eta from 1 to nops(L) do commonDen := commonDen * denom(L[eta]): od: LL := commonDen*L: # list of whole numbers markers := [seq( sum( LL[xi], xi=1..eta) ,eta=1..nops(LL))]: R := rand(1..markers[-1])(): for eta from 1 to nops(L) do if R <= markers[eta] then RETURN(eta): fi: od: fail: end: ## sample call # generateWordRandom(4,a,5); # can rerun repeatedly for different results! ### 04/19/2024 # modification of generateWord # now randomly picks one of the top R most common successor letters # at each step of word construction -- USING A WEIGHTED RANDOM SELECTOR # notice that this one seems to return fail more often. makes sense if you think about it generateWordRandWted := proc(K,A,R) local M,W,current,beta,xi,topR,successor: M:= MakeTrM(K,1): current := op(alphaToNum([A])): W := [current]: # build word for beta from 1 to K-1 do # find index of most common letter after current letter # note we *could* change this to second or third most common letter to get more words # we check only rows 2-27 so we ignore start and end rows topR := sort(M[current + 1][2..27])[-R..-1]: successor := weightedRand(topR): member(successor,M[current + 1],'xi'): W := [op(W), xi-1]: current := xi-1: od: # convert from number indices back to letters # and check if word is an english word if not member(numToAlpha(W), ENG()[K]) then numToAlpha(W); else fail: fi: end: ## sample call # generateWordRandWted(4,a,5); # compare to unweighted version this way # input state size which should be >=2 and a starting letter?? can be done later. generateWordSt := proc(stSize, opti:=-1) local L,let,W,list,bt,a1,a2: # generate stSize many words first: list:=MakeTrM(3,1)[1]: # first letter transition if opti = -1 then let:=weightedRand2([op(2..28, list)]): # this is our first letter! elif opti>=1 and opti<=26 then let:=opti: fi: W:=[0,let]: # this is now a table while nops(W) < stSize do: L:=MakeTrM(3,nops(W)): let:=weightedRand2( [seq( L[W,bt] ,bt=1..27 )] ): W:=[op(W),let]: od: a1:=1: a2:=a1+stSize-1: L:=MakeTrM(stSize,stSize): while W[-1]<>27 do: let:=weightedRand2( [seq( L[ [op(a1..a2 ,W)] ,bt] ,bt=1..27 )] ): W:=[op(W),let]: a1:=a1+1: a2:=a1+stSize-1: od: W:=[op(2..nops(W)-1,W)]: # convert from number indices back to letters # and check if word is an english word if nops(W)>=29 then: return(numToAlpha(W)); fi: if not member(numToAlpha(W),ENG()[nops(W)]) then numToAlpha(W); else fail: fi: end: