# Final Project -- Spring 2024 # Statistics of English Language # Members: Pablo Blanco, Aurora Hiveley, Kaylee Weatherspoon # Due date: May 1, 2024 at 11 PM read(`ENGLISH.txt`): # this file should contain all english words as a list of lists Help:= proc(): if args=NULL then print(`The supported procedures are: MakeTrM, Swap, DevTools`): print(`Try, for example: Help(MakeTrM)`): else: HelpHelper(args): fi: end: HelpHelper:=proc(): if nargs<>1 then: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Too many arguments!`): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): elif args[1]=MakeTrM then print(`******************************************************************************************`): print(`MakeTrM(k): constructs a 28x28 transition probability matrix by using data in ENGLISH.txt.`): print(`Returns this matrix as a list of lists, where the n-th row corresponds to the n-th letter`): print(`of the alphabet. The 0th and 27th rows correspond to the \"start\" and \"end\" states,`): print(`respectively. For example, try: `): print(`MakeTrM(3)[2,3]`): print(`to obtain the transition probability from \"a\" to \"b\" in words of length at least 3.`): print(`******************************************************************************************`): elif args[1]=Swap then print(`******************************************************************************************`): print(`Swap(LETTER): given a letter, given as a string, tells you its index in the alphabet. Try`): print(`Swap("h")`): print(`to see that h is the 8th letter of the alphabet.`): print(`******************************************************************************************`): elif args[1]=DevTools then print(`******************************************************************************************`): print(`DevTools(): the Help command but for developer procedures. Try it!`): print(`******************************************************************************************`): else: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Help() does not support the following procedure:`): print(args): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): fi: end: # This is like Help() but for procedures only intended for development. DevTools:=proc(): if args=NULL then print(`The supported procedures are: ConvertInd, ModifMtx`): else: DevTools1(args): fi: end: DevTools1:=proc(): if nargs<>1 then: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Too many arguments!`): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): elif args[1]=ConvertInd then print(`******************************************************************************************`): print(`ConvertInd(M): Given a list of lists, converts its elements (words) into numerical`): print(`sequences corresponding to their alphabetically-indexed counterpart. Example:`): print(`ConvertInd([[t,a,c,o],[c,a,t]])`): print(`returns [[20, 1, 3, 15], [3, 1, 20]] because \"t\" is the 20th letter of the alphabet and`): print(`so on...`): print(`******************************************************************************************`): elif args[1]=ModifMtx then print(`******************************************************************************************`): print(`ModifMtx(k,L): Given a positive integer k and a list of matrices (lists of lists) L,`): print(`this procedure returns a list of the ROWS of length at least k. Example:`): print(`ModifMtx(2,[ [[1]], [[2,2],[3,3]] ])`): print(`will return [[2,2],[3,3]]`): print(`******************************************************************************************`): else: print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`): print(`Help() does not support the following procedure:`): print(args): print(`<><><><><><><><><><><><><><><><><><><><><><>`): return(): fi: end: # Note: the given english file contains some sketchy two letter words # For instance "[c,l]". I would like to ignore these. # MakeTrM(k1): constructs a transition matrix using all english words of length >= k. # Returns a list of lists with the first column/row denoting "start" and the last row (28) denoting "end" # MakeTrM(k1,1): returns a table T instead. T[["start","a"]] will return the probability that a word begins with "a" MakeTrM:=proc(k1,o1:=0) local E,F,n,m,i,j,Counts,colTotal,T,s: option remember: # Only consider words of length >=k if k1 = 0 or k1 = 1 then E:=[seq(op(s),s in ENG())]: else: E:=ModifMtx(k1, ENG()): #E:=TruncMtx(k1, [seq(op(s),s in ENG())]): fi: E:=ConvertInd(E): # convert all words into number lists. this is done to preserve data and for convenience. n:=nops(E): # number of words Counts:=[[0$28]$28]: # number of transitions colTotal:=[[0]$28]: # number of appearances colTotal[1][1]:=n: # every word has a start and an end colTotal[28][1]:=n: for i from 1 to n do: Counts[1][E[i][1]+1]:=Counts[1][E[i][1]+1]+1: # the first row is dedicated to "start" m:=nops(E[i]): # word length for j from 1 to m-1 do: colTotal[E[i][j]+1][1]:= colTotal[E[i][j]+1][1]+1: # letter E[i][j]+1 has appeared! note: "a" is colmn 2 but 1 in E. # terrible to read, I know. update transition count for j to j+1th letter in the ith word: Counts[E[i][j]+1][E[i][j+1]+1]:=Counts[E[i][j]+1][E[i][j+1]+1] + 1: od: Counts[E[i][m]+1][28]:=Counts[E[i][m]+1][28]+1: # update "end" count colTotal[E[i][m]+1][1]:= colTotal[E[i][m]+1][1]+1: od: T:= [seq([seq(Counts[j][i]/max(colTotal[j][1],1),i=1..28)],j=1..28)]: # transition matrix! if o1=1 then: #op(table([seq(,i=1..)])): # code anti-Swap and then this is easy fi: T: #colTotal: end: # given a list of lists, removes all lists with fewer than k elements. TruncMtx := proc(k,M) local n,S,s,i: n:=nops(M): S:=[]: for i from 1 to n do: if nops(M[i])>=k then: S:=[op(S),i]: fi: od: [seq(M[s],s in S)]: end: # given a list of square matrices, consider only matrices with row length at least k. # return a list of the rows of such matrices. ModifMtx := proc(k,M) local m,S: S:=[]: for m in M do: if nops(m)>=1 and nops(m[1])>=k then: S:=[op(S), op(m)]: fi: od: S: end: # converts the letters in (a list of lists) L into their alphabet index. ConvertInd := proc(L) local alpha,R,i1,j1: alpha:=table([a=1,b=2,c=3,d=4,e=5,f=6,g=7,h=8,i=9,j=10,k=11,l=12,m=13,n=14,o=15,p=16,q=17,r=18,s=19,t=20,u=21,v=22,w=23,x=24,y=25,z=26]): R:=[seq([seq( alpha[L[i1][j1]], j1=1..nops(L[i1]))],i1=1..nops(L))]: end: # changes ones letter STRING to its alphabetic index. Do NOT put into ConvertInd. It breaks it? not sure if it still does. # example: Swap("c") returns 3. Swap:=proc(x1) local alpha: alpha:=table(["a"=1,"b"=2,"c"=3,"d"=4,"e"=5,"f"=6,"g"=7,"h"=8,"i"=9,"j"=10,"k"=11,"l"=12,"m"=13,"n"=14,"o"=15,"p"=16,"q"=17,"r"=18,"s"=19,"t"=20,"u"=21,"v"=22,"w"=23,"x"=24,"y"=25,"z"=26]): alpha[x1]: end: # initializes a transition matrix/table for an alphabet of length n and m-letter states. 0 always denotes "start" and n+1 denotes "end" InitMtx:=proc(n,m) local T,S,s,t: T:=table([]): S:=ConstrSeq(n,m): for s in S do: for t in S do: T[s,t] := 0: od: od: op(T): end: # construct all m-tuples of the form #[a_1,...,a_m] # where a_1 \in {0,...,m} # a_2 \in {1,...,m-1} # a_m \in {1,..., m+1} ConstrSeq := proc(n,m) local S: S:={}: end: