######################################################################
## EM19Proj1.txt: Save this file as EM19Proj1.txt to use it, stay in
## the same directory, get into Maple (by typing: maple),
## and then type: read EM19Proj1.txt:
## Then follow the instructions given there.
##
## Written by students of Dr. Z.'s Math 640, Spring 2019, class.
## Coordinated by Yukun Yao, yao@math.rutgers.edu
######################################################################
with(CurveFitting):
with(linalg):
with(SignalProcessing):
read `DATA.txt`:
print(`This is EM19Proj1.txt, a final project for Dr. Z.'s Math 640, Spring 2019, class, written by`):
print(`Yukun Yao (coordinator), Victoria Chayes, ....., `):
print(`Its purpose is to analyze the relationship between faculty accomplishments and rewards (rank and salary), and among these quantities themselves`):
Help:=proc()
if args=NULL then
 print(` EM19Proj1.txt: A Maple package for analyzing data given in Maple in terms of an ABT `):
 print(`with the format [FirstName, LastName, Features, ... ]`):
 print(`The MAIN procedures are: Cor, ExtractFields, SubLists `):
 print(`The artificial neural net procedures are: NN1(DATA,N,reg,max_epochs,alph), Test1(x,W,b,N), preproc(DATA), and WeightInitializeGauss(rows,cols)`):
elif nargs=1 and args[1]=NN1 then
 print(`1. Call L:=ExtractFields(MR, [descriptive feature 1, descriptive feature 2, ..., descriptive feature k, 3])`):
 print(`2. Then, using the Statistics package in Maple, let L:=Shuffle(L) so as to unsort the data.`):
 print(`3. Preprocess the data to normalize it by calling L:=preproc(L)`):
 print(`4. Take the first 45 (or however many you choose) entries of L by Ltrain:=[op(1..45,L)]`):
 print(`5. Choose the "hyper-parameters" reg = regularization strength, max_epochs = number of times to update the ANN, and alph = learning rate, and run J:=NN1(Ltrain,4,reg,max_epochs,alph): We found that letting reg=0.001, max_epochs=200, and alph=0.1 was successful.`):
 print(`The list J has length three and contains the trained matrix and vector corresponding to the linear transformation. The third entry of J is a list of pairs [epoch,loss] which can be used to visualize the loss over time.`):
elif nargs=1 and args[1]=Cor then
 print(`Cor(L): inputs a list of pairs [P,S] and outputs the correlation. Try:`):
 print(`Cor(ExtractFields(MR,[4,7])); `):
elif nargs=1 and args[1]=ExtractFields then
 print(`ExtractFields(L,M): inputs a list of lists L and a list M, and produces a list of sublists with only the features given by M, in that order.`):
 print(`For example, try:`):
 print(`ExtractFields(MR,[7,1,2]);`):
elif nargs=1 and args[1]=SubLists then
 print(`SubLists(L,I1,J1): given a list L of lists where, in each item, the I1-th entry is a categorical feature taking values from 1 to J1,`):
 print(`outputs the individual lists for each of the categorical values. Try:`):
 print(`SubLists(MR,3,4);`):
 print(``):
else
 print(`There is no such thing as`, args):
fi:
end:
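#A sample session (added for illustration only; the descriptive-feature columns 4, 5, 6 below are
#placeholders, pick whichever columns of MR you like, while column 3 is taken to be the rank, as in
#Help(NN1), and we assume the data set has more than 45 rows):
# with(Statistics):
# L:=ExtractFields(MR,[4,5,6,3]):   #three descriptive features followed by the rank
# L:=Shuffle(L):                    #unsort the data
# L:=preproc(L):                    #normalize the descriptive features
# Ltrain:=[op(1..45,L)]:            #training set
# J:=NN1(Ltrain,4,0.001,200,0.1):   #reg=0.001, max_epochs=200, alph=0.1, the values suggested in Help(NN1)
# Test1(L[46],J[1],J[2],4);         #predicted rank probabilities for a held-out professor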
#SubLists(L,I1,J1): given a list L of lists where, in each item, the I1-th entry is a categorical feature taking values from 1 to J1,
#outputs the individual lists for each of the categorical values. Try:
#SubLists(MR,3,4);
SubLists:=proc(L,I1,J1) local T,J1A,K:
if not (type(L,list) and {seq(type(L[K],list),K=1..nops(L))}={true} and nops({seq(nops(L[K]),K=1..nops(L))})=1 ) then
 print(`Bad input`):
 RETURN(FAIL):
fi:
if {seq(L[K][I1],K=1..nops(L))} minus {seq(K,K=1..J1)}<>{} then
 print(`Bad input`):
 RETURN(FAIL):
fi:
for J1A from 1 to J1 do
 T[J1A]:=[]:
od:
for K from 1 to nops(L) do
 T[L[K][I1]]:=[op(T[L[K][I1]]),L[K]]:
od:
[seq(T[K],K=1..J1)]:
end:

#ExtractFields(L,M): inputs a list of lists L and a list M, and produces a list of sublists with only the features given by M,
#in that order, and sorted.
#For example, try:
#ExtractFields(MR,[7,1,2]);
ExtractFields:=proc(L,M) local K1,K2:
sort([seq([seq(L[K1][M[K2]],K2=1..nops(M))],K1=1..nops(L))]):
end:

#AveAndSD(L): the average and standard deviation of the list of numbers L
AveAndSD:=proc(L) local i,mu,sig:
mu:=evalf(add(L[i],i=1..nops(L))/nops(L)):
sig:=sqrt(add((L[i]-mu)^2,i=1..nops(L))/nops(L)):
[mu,sig]:
end:

#Cor(L): inputs a list of pairs [P,S] and outputs [[mu1,sig1],[mu2,sig2],correlation], i.e. the average and
#standard deviation of each component followed by the correlation between the two components
Cor:=proc(L) local L1,L2,i,A,B,mu1,mu2,sig1,sig2:
L1:=[seq(L[i][1],i=1..nops(L))]:
L2:=[seq(L[i][2],i=1..nops(L))]:
A:=AveAndSD(L1):
B:=AveAndSD(L2):
mu1:=A[1]: sig1:=A[2]: mu2:=B[1]: sig2:=B[2]:
[A,B,add((L1[i]-mu1)*(L2[i]-mu2),i=1..nops(L))/(nops(L)*sig1*sig2)]:
end:
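#The last entry returned by Cor is the usual Pearson correlation coefficient, computed with the
#population (divide-by-n) standard deviations that AveAndSD returns. As a quick sanity check
#(an added illustration, not one of the original examples), the pairs below are perfectly
#correlated, so the last entry of the output should be 1.:
# Cor([[1,2],[2,4],[3,6]]);   #returns [[2., .8164965809], [4., 1.632993162], 1.000000000] up to rounding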
#######################BEGIN ANN#####################################
#This ONE-layer ANN uses the soft-max function for classification along with the cross-entropy loss function.
#We follow ideas from http://cs231n.github.io/neural-networks-case-study/#linear
#We want to predict the probabilities that a certain professor (represented by
#[descriptive feature 1, descriptive feature 2, ..., descriptive feature k]) has of being of a given rank.
#From a quick scan online, it seems best to use the soft-max classifier, so the output of the ANN is a list y
#of length four with entries between 0 and 1 which sum to 1.
#y[i] is the probability that a given professor is of rank i.
#TO USE THIS ANN WITH THE MR DATA:
#1. Call L:=ExtractFields(MR, [descriptive feature 1, descriptive feature 2, ..., descriptive feature k, 3]).
#2. Using the Statistics package in Maple, let L:=Shuffle(L) so as to unsort the data.
#3. Preprocess the data to normalize it by calling L:=preproc(L).
#4. Take the first 45 (or however many you choose) entries of L by Ltrain:=[op(1..45,L)].
#5. Choose the "hyper-parameters" reg = regularization strength, max_epochs = number of times to update the ANN,
#   and alph = learning rate, and run J:=NN1(Ltrain,4,reg,max_epochs,alph): We found that letting reg=0.001,
#   max_epochs=200, and alph=0.1 was successful. Here there are N=4 target classes (the four ranks), but we let N
#   be a parameter since one could also train the neural net to predict different targets, such as whether
#   someone is an AMS Fellow, which would correspond to N=2.
NN1:=proc(DATA,N,reg,max_epochs,alph) local x,K,y,h,i,j,t,L,loss,W,l,b,X,size,score,exp_score,k,new,tot,dscores,XT,su,std:
x:=DATA[1]:
#Putting the data in a nice format: we create a matrix X whose rows are the descriptive features of the data points,
#and a matrix t of the corresponding one-hot target vectors.
K:=nops(x)-1:
X:=[]:
t:=[]:
W:=0.01*WeightInitializeGauss(K,N):
size:=nops(DATA):
for i from 1 to size do
 new:=[op(1..K,DATA[i])]:
 X:=[op(X),new]:
 new:=[0$N]: #one slot per target class (e.g. the four ranks of professor)
 new[DATA[i][K+1]]:=1:
 t:=[op(t),new]:
od:
#Normalizing the data to have mean 0 and standard deviation 1 is now done beforehand by preproc,
#so the following block is commented out:
#for i from 1 to 3 do
# su:=0:
# su:=add(X[j][i],j=1..size)/size:
# std:=sqrt(add((X[j][i]-su)^2,j=1..size)/size):
# for l from 1 to size do
#  X[l,i]:=(X[l,i]-su)*(1/std):
# od:
#od:
X:=matrix(X):
t:=matrix(t):
b:=[0$N]:
L:=[]:
loss:=0:
for k from 1 to max_epochs do
 score:=multiply(X,W):
 #We add b in the next line due to differences between Python and Maple
 exp_score:=matrix(size,N,(i,j)->evalf(exp(score[i,j]+b[j]))): #the number of columns is the number of target classes N
 tot:=matrix(size,1,(i,j)->add(exp_score[i,l],l=1..N)):
 for i from 1 to size do
  for j from 1 to N do
   exp_score[i,j]:=exp_score[i,j]*(1/tot[i,1]):
  od:
 od:
 loss:=0:
 for i from 1 to size do
  loss:=loss+evalf(-log(exp_score[i,DATA[i][K+1]])):
 od:
 loss:=loss/size:
 L:=[op(L),[k,loss]]:
 dscores:=exp_score:
 for i from 1 to size do
  dscores[i,DATA[i][K+1]]:=dscores[i,DATA[i][K+1]]-1:
 od:
 #dscores:=dscores*0.45:
 XT:=transpose(X):
 W:=evalm(evalm(W-alph*multiply(XT,dscores))-alph*reg*W):
 for i from 1 to N do
  b[i]:=b[i]-alph*add(dscores[j,i],j=1..size):
 od:
 if k mod 10 = 0 then
  print(`After iteration`, k, ` Loss= `, loss):
 fi:
od:
[W,b,L]:
end:

#WeightInitializeGauss(rows,cols): a rows-by-cols matrix with independent standard Gaussian entries
WeightInitializeGauss:=proc(rows,cols) local L,W:
L:=GenerateGaussian(rows*cols,0,1):
W:=matrix(rows,cols,(i,j)->L[(i-1)*cols+j]):
end:

#Call Test1 with normalized data (since that is what was used to train the network).
#Test1 takes in x, one piece of normalized data, W, a matrix of the appropriate size (nops(x)-1 by the number of
#target features), and a vector b of length equal to the number of target features, which is also the parameter N.
#Outputs the list of probabilities predicted by the ANN.
Test1:=proc(x,W,b,N) local xx,bb,exp_score,score,nor,w,K:
K:=nops(x)-1:
xx:=[op(1..K,x)]:
xx:=matrix([xx]):
bb:=matrix([b]):
w:=matrix(evalf(W)):
score:=evalm(multiply(xx,w)+bb):
exp_score:=[seq(exp(score[1,j]),j=1..N)]:
nor:=add(exp_score[i],i=1..N):
exp_score:=evalf(exp_score*(1/nor)):
end:

#preproc(DATA): takes in DATA, a list of lists [descriptive 1, ..., descriptive k, target feature], and normalizes
#the descriptive features to have mean 0 and standard deviation 1, leaving the target feature untouched.
preproc:=proc(DATA) local i,j,su,std,size,K,X,x,l:
x:=DATA[1]:
K:=nops(x)-1:
X:=[]:
size:=nops(DATA):
for i from 1 to size do
 X:=[op(X),DATA[i]]:
od:
for i from 1 to K do
 su:=add(X[j][i],j=1..size)/size:
 std:=sqrt(add((X[j][i]-su)^2,j=1..size)/size):
 for l from 1 to size do
  X[l,i]:=(X[l,i]-su)*(1/std):
 od:
od:
X:
end:

#WeightInitialize(rows,cols,max): a rows-by-cols array (list of lists) of random integers between -max and max;
#an alternative, integer-valued weight initializer (NN1 uses WeightInitializeGauss instead).
WeightInitialize:=proc(rows,cols,max) local i,j,Mat,ra:
ra:=rand(-max..max):
Mat:=[seq([seq(ra(),j=1..cols)],i=1..rows)]:
end:
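#For reference (an added illustration, not part of the package itself): the soft-max and cross-entropy
#computations inside NN1 and Test1 act on each row of raw scores as follows. With a score row s=[2,1,0,-1]
#for a professor whose true rank is 1, the predicted probabilities and this row's loss contribution are:
# s:=[2,1,0,-1]:
# p:=evalf([seq(exp(s[i])/add(exp(s[j]),j=1..4),i=1..4)]);  #approximately [.644, .237, .087, .032], summing to 1
# evalf(-log(p[1]));                                        #approximately .440, the cross-entropy loss for this row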