# Final Project -- Spring 2024
# Statistics of English Language
# Members: Pablo Blanco, Aurora Hiveley, Kaylee Weatherspoon
# Due date: May 1, 2024 at 11 PM

read(`ENGLISH.txt`): # this file should contain all english words as a list of lists


Help:= proc():
if args=NULL then
 print(`The supported procedures are: MakeTrM, Swap, DevTools`):
 print(`Try, for example: Help(MakeTrM)`):
else:
 HelpHelper(args):
fi:

end:

HelpHelper:=proc():
 if nargs<>1 then:
    print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`):
  print(`Too many arguments!`):
    print(`<><><><><><><><><><><><><><><><><><><><><><>`):
  return():
 elif args[1]=MakeTrM then
  print(`******************************************************************************************`):
  print(`MakeTrM(k): constructs a 28x28 transition probability matrix by using data in ENGLISH.txt.`):
  print(`Returns this matrix as a list of lists, where the n-th row corresponds to the n-th letter`): 
  print(`of the alphabet. The 0th and 27th rows correspond to the \"start\" and \"end\" states,`):
  print(`respectively. For example, try: `):
  print(`MakeTrM(3)[2,3]`):
  print(`to obtain the transition probability from \"a\" to \"b\" in words of length at least 3.`):
  print(`******************************************************************************************`):
 elif args[1]=Swap then
  print(`******************************************************************************************`):
  print(`Swap(LETTER): given a letter, given as a string, tells you its index in the alphabet. Try`):
  print(`Swap("h")`):
  print(`to see that h is the 8th letter of the alphabet.`):
  print(`******************************************************************************************`):
 elif args[1]=DevTools then
  print(`******************************************************************************************`):
  print(`DevTools(): the Help command but for developer procedures. Try it!`):
  print(`******************************************************************************************`):
 else:
  print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`):
  print(`Help() does not support the following procedure:`):
  print(args):
  print(`<><><><><><><><><><><><><><><><><><><><><><>`):
  return():
 fi:
end:

# This is like Help() but for procedures only intended for development.
DevTools:=proc():
 if args=NULL then
  print(`The supported procedures are: ConvertInd, ModifMtx`):
 else:
  DevTools1(args):
 fi:
end:

DevTools1:=proc():
 if nargs<>1 then:
    print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`):
  print(`Too many arguments!`):
    print(`<><><><><><><><><><><><><><><><><><><><><><>`):
  return():
 elif args[1]=ConvertInd then
   print(`******************************************************************************************`):
   print(`ConvertInd(M): Given a list of lists, converts its elements (words) into numerical`):
   print(`sequences corresponding to their alphabetically-indexed counterpart. Example:`):
   print(`ConvertInd([[t,a,c,o],[c,a,t]])`):
   print(`returns [[20, 1, 3, 15], [3, 1, 20]] because \"t\" is the 20th letter of the alphabet and`):
   print(`so on...`):
   print(`******************************************************************************************`):
 elif args[1]=ModifMtx then
   print(`******************************************************************************************`):
   print(`ModifMtx(k,L): Given a positive integer k and a list of matrices (lists of lists) L,`):
   print(`this procedure returns a list of the ROWS of length at least k. Example:`):
   print(`ModifMtx(2,[ [[1]], [[2,2],[3,3]] ])`):
   print(`will return [[2,2],[3,3]]`):
   print(`******************************************************************************************`):
 else:
  print(`<><><><><><><><><><>ERROR<><><><><><><><><><>`):
  print(`Help() does not support the following procedure:`):
  print(args):
  print(`<><><><><><><><><><><><><><><><><><><><><><>`):
  return():
 fi:
end:




# Note: the given english file contains some sketchy two letter words
# For instance "[c,l]". I would like to ignore these.

# MakeTrM(k1): constructs a transition matrix using all english words of length >= k.
# Returns a list of lists with the first column/row denoting "start" and the last row (28) denoting "end"
# MakeTrM(k1,1): returns a table T instead. T[["start","a"]] will return the probability that a word begins with "a"
MakeTrM:=proc(k1,o1:=0) local E,F,n,m,i,j,Counts,colTotal,T,s:
 option remember:
 
 # Only consider words of length >=k
 if k1 = 0 or k1 = 1 then
  E:=[seq(op(s),s in ENG())]:
 else: 
  E:=ModifMtx(k1, ENG()):
  #E:=TruncMtx(k1, [seq(op(s),s in ENG())]):
 fi:
  
 E:=ConvertInd(E):          # convert all words into number lists. this is done to preserve data and for convenience.
 n:=nops(E):                # number of words
 Counts:=[[0$28]$28]:       # number of transitions
 colTotal:=[[0]$28]:        # number of appearances
 colTotal[1][1]:=n:            # every word has a start and an end
 colTotal[28][1]:=n:
 
 for i from 1 to n do:
  Counts[1][E[i][1]+1]:=Counts[1][E[i][1]+1]+1: # the first row is dedicated to "start"
  m:=nops(E[i]): # word length
  for j from 1 to m-1 do:
   colTotal[E[i][j]+1][1]:= colTotal[E[i][j]+1][1]+1: # letter E[i][j]+1 has appeared! note: "a" is colmn 2 but 1 in E.
   # terrible to read, I know. update transition count for j to j+1th letter in the ith word:
   Counts[E[i][j]+1][E[i][j+1]+1]:=Counts[E[i][j]+1][E[i][j+1]+1] + 1: 
  od:
  Counts[E[i][m]+1][28]:=Counts[E[i][m]+1][28]+1: # update "end" count
  colTotal[E[i][m]+1][1]:= colTotal[E[i][m]+1][1]+1:
 od:

 T:= [seq([seq(Counts[j][i]/max(colTotal[j][1],1),i=1..28)],j=1..28)]: # transition matrix!

 if o1=1 then:
  #op(table([seq(,i=1..)])):  # code anti-Swap and then this is easy
 fi:
 
 T:

 #colTotal:
end:

# given a list of lists, removes all lists with fewer than k elements.
TruncMtx := proc(k,M) local n,S,s,i:
 n:=nops(M):
 S:=[]:
 for i from 1 to n do:
  if nops(M[i])>=k then:
   S:=[op(S),i]:
  fi:
 od:
 [seq(M[s],s in S)]:
end:

# given a list of square matrices, consider only matrices with row length at least k.
# return a list of the rows of such matrices.
ModifMtx := proc(k,M) local m,S:
 S:=[]:
 for m in M do:
  if nops(m)>=1 and nops(m[1])>=k then:
   S:=[op(S), op(m)]:
  fi:
 od:
 S:
end:

# converts the letters in (a list of lists) L into their alphabet index. 
ConvertInd := proc(L) local alpha,R,i1,j1:
 alpha:=table([a=1,b=2,c=3,d=4,e=5,f=6,g=7,h=8,i=9,j=10,k=11,l=12,m=13,n=14,o=15,p=16,q=17,r=18,s=19,t=20,u=21,v=22,w=23,x=24,y=25,z=26]):
 R:=[seq([seq( alpha[L[i1][j1]], j1=1..nops(L[i1]))],i1=1..nops(L))]:
end:


# changes ones letter STRING to its alphabetic index. Do NOT put into ConvertInd. It breaks it? not sure if it still does.
# example: Swap("c") returns 3.
Swap:=proc(x1) local alpha:
 alpha:=table(["a"=1,"b"=2,"c"=3,"d"=4,"e"=5,"f"=6,"g"=7,"h"=8,"i"=9,"j"=10,"k"=11,"l"=12,"m"=13,"n"=14,"o"=15,"p"=16,"q"=17,"r"=18,"s"=19,"t"=20,"u"=21,"v"=22,"w"=23,"x"=24,"y"=25,"z"=26]):
 alpha[x1]:
end:

# initializes a transition matrix/table for an alphabet of length n and m-letter states. 0 always denotes "start" and n+1 denotes "end"
InitMtx:=proc(n,m) local T,S,s,t:
 T:=table([]):
 S:=ConstrSeq(n,m):
 for s in S do:
  for t in S do:  
   T[s,t] := 0:
  od:
 od: 
 op(T):
end:

# construct all m-tuples of the form

#[a_1,...,a_m]

# where a_1 \in {0,...,m}

# a_2 \in {1,...,m-1}

# a_m \in {1,..., m+1}

ConstrSeq := proc(n,m) local S:
 
 S:={}:

end: