#Logistic regression package
#by Jason Saied

#Load the raw data set (expected to define the table MR used in the Help examples).
read `DATA.txt`:
#Load supporting procedures.
#NOTE(review): the Help text below describes EM19Proj1.txt itself — confirm this
#read is loading a separate support file and is not self-referential.
read `EM19Proj1.txt`:

#Help(): online help for the package.
#Called with no arguments, prints a package overview; called with the name of a
#main procedure (NormPrep or LogRegress), prints that procedure's usage and an example.
#Any other argument falls through to the "no such thing" message.
Help:=proc()
if args=NULL then
 print(` EM19Proj1.txt: A Maple package for Analyzing Data in Maple given in terms of an ABT `):
 print(`with the format [FirstName,LastName,Features , ... ]`):
 print(`The MAIN procedures are: NormPrep, LogRegress`):
 print(` `):
elif nargs=1 and args[1]=NormPrep then
 print(`NormPrep(D,Descs,Target,k,L): input a data set D in the above format, a list Descs of the indices of the descriptive features in which you are interested, a positive number Target indicating the index of the target feature, a number k, and a list L of approximate upper bounds for the values of the chosen descriptive features. Outputs the data set in the format expected by LogRegress, normalizing the descriptive features by the scalars in L so that the gradient descent converges more quickly.`):
 print(`For example, try:`):
 print(`NormPrep(MR, [4,5,6],3,3,[400,5000,30])`):
elif nargs=1 and args[1]=LogRegress then
 print(`LogRegress(D,rate, K, x, tolerance, numTries): input a data set D in the format [[Descriptive Features],Target Feature] (where Target is 0 or 1), a symbol x, positive numbers rate, tolerance, and K, and a positive integer numTries. Uses gradient descent to find the weights in the logistic regression for D, with starting weights chosen in [-K,K] and learning rate called rate. Stops when the sum of squared errors is less than tolerance, or when numTries has been exceeded. Outputs a list whose first entry is the sum of squared errors and whose second entry is the regression function, a function in the variables x[i].`):
 print(`For example, try:`):
 print(`LogRegress(NormPrep(MR, [4,5,6],3,3,[400,5000,30]), 2.0, 1.0, x, 3.15, 10000) `):
else
 print(`There is no such thing as`, args):
fi:
end:

#LogRegress(D,rate, K, x, tolerance, numTries): uses gradient descent to calculate the logistic regression.
#Returns a list consisting of the sum of squared errors and the regression function.
#D: the data set. List of lists of the form
#   [[value of descriptive feature 1, ..., value of descriptive feature n], value of target feature]
#   the target feature is assumed to take only the values 0 and 1
#rate: the learning rate
#K: determines how large we will allow the starting weights to be (drawn from rand(-K..K))
#   NOTE(review): Maple's rand expects an integer range, yet K is described elsewhere as a
#   floating point number (the Help example passes 1.0) — confirm rand(-K..K) behaves as intended.
#x: a symbol; the returned regression function is expressed in the indexed variables x[i]
#tolerance: once the sum of squared errors E is below tolerance, stop
#numTries: give up after this many iterations. NOTE(review): despite the original header's
#   claim, FAIL is never returned — on exhausting numTries the current [E, function] is
#   returned with E still >= tolerance, so callers should inspect E.
LogRegress:=proc(D, rate, K, x, tolerance, numTries)
local w, m, n, i, j, ra, E, count:
#m = number of descriptive features, n = number of data points
m:=nops(D[1][1]):
n:=nops(D):
count:=0:
ra:=rand(-K..K):
#pick random starting point in weight space; w is a table with exactly the m+1
#entries w[0]..w[m] (w[0] is the bias). Logi relies on this entry count.
for i from 0 to m do
 w[i]:=ra():
od:
#seed E above tolerance so the loop body runs at least once
E:=tolerance+1:
while (E>=tolerance and count < numTries) do
 #if (count mod 100)=0 then
 # print(op(op(w))):
 #fi:
 count:=count+1:
 #batch update: each weight moves by rate times its error delta (ED is the
 #negative gradient component of the squared error, so "+" descends the error)
 for j from 0 to m do
  #print(rate*ED(D,w,j)):
  w[j]:=w[j]+rate*ED(D,w,j):
 od:
 #calculate the sum of squared errors over the whole data set
 E:=add((D[i][2]-evalf(Logi(w,D[i][1])))^2,i=1..n):
 #if (count mod 100)=0 then
 # print(E):
 #fi:
od:
#print(`error is `,E):
#print(count):
#evaluating Logi at the bare symbol x yields a formula in x[1]..x[m]
return [E,evalf(Logi(w,x))]:
end:

#Logi(w,d): calculates logistic(w[0]+w[1]*d[1]+...+w[m]*d[m]), where logistic(x)=1/(1+e^(-x))
#w: the current set of weights (a table with entries indexed 0..m)
#d: the values of the descriptive features of a data point (or a symbol, giving a formula)
#NOTE(review): nops(op(op(w)))-1 recovers m by counting the entries of table w;
#this assumes w holds exactly the m+1 weights and nothing else — fragile if w changes shape.
Logi:=proc(w,d):
return evalf(1/(1+exp(-(w[0]+add(w[i]*d[i], i=1..(nops(op(op(w)))-1)))))):
end:

#ED(D,w,j): calculates the error delta for multivariable logistic regression,
#i.e. sum over data points of (target - prediction)*prediction*(1-prediction)*feature_j
#D: the data set
#w: the current vector of weights
#j: the w[j] we want to update
ED:=proc(D,w,j)
local i, tempSum, tempLogi, relevantDesc:
tempSum:=0:
for i from 1 to nops(D) do
 #j=0 case is different: the bias multiplies a constant 1 instead of a feature value
 if j=0 then
  relevantDesc:=1:
 else
  relevantDesc:=D[i][1][j]:
 fi:
 tempLogi:=Logi(w,D[i][1]):
 tempSum:=tempSum + (D[i][2]-tempLogi)*tempLogi*(1-tempLogi)*relevantDesc:
od:
return tempSum:
end:

######methods for our
#particular application: helpers that turn raw ABT records into the
#[[descriptive values], 0/1 target] format consumed by LogRegress.

#PrepData(D,Descs,Target,k): pull the chosen columns out of the data set D and
#binarize the target: a record gets target 1 when its Target-column value is >=k, else 0.
#Output format: list of [[descriptive values], 0 or 1].
#D: the data set in its original form
#Descs: list of indices of the columns to keep as descriptive features
#Target: index of the column that becomes the target feature
#k: threshold for the target feature
PrepData:=proc(D, Descs, Target, k)
local Reduced, numFields, r:
 #keep only the requested columns (ExtractFields is defined elsewhere in the package);
 #the target column is appended last, at position numFields
 Reduced:=ExtractFields(D,[op(Descs),Target]):
 numFields:=nops(Reduced[1]):
 #split each reduced record into its descriptive prefix and thresholded target
 [seq([[op(1..numFields-1, Reduced[r])],Conv(Reduced[r][numFields], k)], r=1..nops(D))]:
end:

#Conv(r,k): threshold indicator used by PrepData — 1 when r>=k, otherwise 0.
Conv:=proc(r,k):
 if r>=k then
  1:
 else
  0:
 fi:
end:

#Normalize(D,L): rescale every descriptive feature of D by the bounds in L,
#so feature j of each record is divided by L[j] (targets are left untouched).
#D: the data set in the form [[Descriptive],Target] as produced by PrepData
#L: list of approximate upper bounds, one per descriptive feature
Normalize:=proc(D,L)
local rec, scaled, result, col:
 result:=[]:
 for rec in D do
  scaled:=[seq(evalf(rec[1][col]/L[col]), col=1..nops(L))]:
  result:=[op(result), [scaled, rec[2]]]:
 od:
 result:
end:

#NormPrep(D,Descs,Target,k,L): convenience wrapper — PrepData followed by Normalize.
#Extracts the columns in Descs, binarizes column Target against threshold k,
#then divides descriptive feature i by L[i] so gradient descent converges faster.
NormPrep:=proc(D,Descs,Target,k,L)
local binarized:
 binarized:=PrepData(D,Descs,Target,k):
 Normalize(binarized,L):
end: