#Logistic regression package
#by Jason Saied

#Load the raw data set (expected to define the table MR used in the Help examples).
read `DATA.txt`:
#Load supporting procedures.
#NOTE(review): the Help text below describes EM19Proj1.txt itself — confirm this
#read is loading a separate support file and is not self-referential.
read `EM19Proj1.txt`:

#Help(): online help for the package.
#Called with no arguments, prints a package overview; called with the name of a
#main procedure (NormPrep or LogRegress), prints that procedure's usage and an example.
#Any other argument falls through to the "no such thing" message.
Help:=proc()
if args=NULL then
 print(` EM19Proj1.txt: A Maple package for Analyzing Data in Maple given in terms of an ABT `):
 print(`with the format [FirstName,LastName,Features , ... ]`):
 print(`The MAIN procedures are: NormPrep, LogRegress`):
 print(` `):
elif nargs=1 and args[1]=NormPrep then
 print(`NormPrep(D,Descs,Target,k,L): input a data set D in the above format, a list Descs of the indices of the descriptive features in which you are interested, a positive number Target indicating the index of the target feature, a number k, and a list L of approximate upper bounds for the values of the chosen descriptive features. Outputs the data set in the format expected by LogRegress, normalizing the descriptive features by the scalars in L so that the gradient descent converges more quickly.`):
 print(`For example, try:`):
 print(`NormPrep(MR, [4,5,6],3,3,[400,5000,30])`):
elif nargs=1 and args[1]=LogRegress then
 print(`LogRegress(D,rate, K, x, tolerance, numTries): input a data set D in the format [[Descriptive Features],Target Feature] (where Target is 0 or 1), a symbol x, positive numbers rate, tolerance, and K, and a positive integer numTries. Uses gradient descent to find the weights in the logistic regression for D, with starting weights chosen in [-K,K] and learning rate called rate. Stops when the sum of squared errors is less than tolerance, or when numTries has been exceeded. Outputs a list whose first entry is the sum of squared errors and whose second entry is the regression function, a function in the variables x[i].`):
 print(`For example, try:`):
 print(`LogRegress(NormPrep(MR, [4,5,6],3,3,[400,5000,30]), 2.0, 1.0, x, 3.15, 10000) `):
else
 print(`There is no such thing as`, args):
fi:
end:

#LogRegress(D,rate, K, x, tolerance, numTries): uses gradient descent to calculate the logistic regression.
#Returns a list consisting of the sum of squared errors and the regression function.
#D: the data set. List of lists of the form
#   [[value of descriptive feature 1, ..., value of descriptive feature n], value of target feature]
#   the target feature is assumed to take only the values 0 and 1
#rate: the learning rate
#K: determines how large we will allow the starting weights to be (drawn from rand(-K..K))
#   NOTE(review): Maple's rand expects an integer range, yet K is described elsewhere as a
#   floating point number (the Help example passes 1.0) — confirm rand(-K..K) behaves as intended.
#x: a symbol; the returned regression function is expressed in the indexed variables x[i]
#tolerance: once the sum of squared errors E is below tolerance, stop
#numTries: give up after this many iterations. NOTE(review): despite the original header's
#   claim, FAIL is never returned — on exhausting numTries the current [E, function] is
#   returned with E still >= tolerance, so callers should inspect E.
LogRegress:=proc(D, rate, K, x, tolerance, numTries)
local w, m, n, i, j, ra, E, count:
#m = number of descriptive features, n = number of data points
m:=nops(D[1][1]):
n:=nops(D):
count:=0:
ra:=rand(-K..K):
#pick random starting point in weight space; w is a table with exactly the m+1
#entries w[0]..w[m] (w[0] is the bias). Logi relies on this entry count.
for i from 0 to m do
 w[i]:=ra():
od:
#seed E above tolerance so the loop body runs at least once
E:=tolerance+1:
while (E>=tolerance and count < numTries) do
 #if (count mod 100)=0 then
 # print(op(op(w))):
 #fi:
 count:=count+1:
 #batch update: each weight moves by rate times its error delta (ED is the
 #negative gradient component of the squared error, so "+" descends the error)
 for j from 0 to m do
  #print(rate*ED(D,w,j)):
  w[j]:=w[j]+rate*ED(D,w,j):
 od:
 #calculate the sum of squared errors over the whole data set
 E:=add((D[i][2]-evalf(Logi(w,D[i][1])))^2,i=1..n):
 #if (count mod 100)=0 then
 # print(E):
 #fi:
od:
#print(`error is `,E):
#print(count):
#evaluating Logi at the bare symbol x yields a formula in x[1]..x[m]
return [E,evalf(Logi(w,x))]:
end:

#Logi(w,d): calculates logistic(w[0]+w[1]*d[1]+...+w[m]*d[m]), where logistic(x)=1/(1+e^(-x))
#w: the current set of weights (a table with entries indexed 0..m)
#d: the values of the descriptive features of a data point (or a symbol, giving a formula)
#NOTE(review): nops(op(op(w)))-1 recovers m by counting the entries of table w;
#this assumes w holds exactly the m+1 weights and nothing else — fragile if w changes shape.
Logi:=proc(w,d):
return evalf(1/(1+exp(-(w[0]+add(w[i]*d[i], i=1..(nops(op(op(w)))-1)))))):
end:

#ED(D,w,j): calculates the error delta for multivariable logistic regression,
#i.e. sum over data points of (target - prediction)*prediction*(1-prediction)*feature_j
#D: the data set
#w: the current vector of weights
#j: the w[j] we want to update
ED:=proc(D,w,j)
local i, tempSum, tempLogi, relevantDesc:
tempSum:=0:
for i from 1 to nops(D) do
 #j=0 case is different: the bias multiplies a constant 1 instead of a feature value
 if j=0 then
  relevantDesc:=1:
 else
  relevantDesc:=D[i][1][j]:
 fi:
 tempLogi:=Logi(w,D[i][1]):
 tempSum:=tempSum + (D[i][2]-tempLogi)*tempLogi*(1-tempLogi)*relevantDesc:
od:
return tempSum:
end:

######methods for our
#particular application: helpers that turn raw ABT records into the
#[[descriptive values], 0/1 target] format consumed by LogRegress.

#PrepData(D,Descs,Target,k): pull the chosen columns out of the data set D and
#binarize the target: a record gets target 1 when its Target-column value is >=k, else 0.
#Output format: list of [[descriptive values], 0 or 1].
#D: the data set in its original form
#Descs: list of indices of the columns to keep as descriptive features
#Target: index of the column that becomes the target feature
#k: threshold for the target feature
PrepData:=proc(D, Descs, Target, k)
local Reduced, numFields, r:
 #keep only the requested columns (ExtractFields is defined elsewhere in the package);
 #the target column is appended last, at position numFields
 Reduced:=ExtractFields(D,[op(Descs),Target]):
 numFields:=nops(Reduced[1]):
 #split each reduced record into its descriptive prefix and thresholded target
 [seq([[op(1..numFields-1, Reduced[r])],Conv(Reduced[r][numFields], k)], r=1..nops(D))]:
end:

#Conv(r,k): threshold indicator used by PrepData — 1 when r>=k, otherwise 0.
Conv:=proc(r,k):
 if r>=k then
  1:
 else
  0:
 fi:
end:

#Normalize(D,L): rescale every descriptive feature of D by the bounds in L,
#so feature j of each record is divided by L[j] (targets are left untouched).
#D: the data set in the form [[Descriptive],Target] as produced by PrepData
#L: list of approximate upper bounds, one per descriptive feature
Normalize:=proc(D,L)
local rec, scaled, result, col:
 result:=[]:
 for rec in D do
  scaled:=[seq(evalf(rec[1][col]/L[col]), col=1..nops(L))]:
  result:=[op(result), [scaled, rec[2]]]:
 od:
 result:
end:

#NormPrep(D,Descs,Target,k,L): convenience wrapper — PrepData followed by Normalize.
#Extracts the columns in Descs, binarizes column Target against threshold k,
#then divides descriptive feature i by L[i] so gradient descent converges faster.
NormPrep:=proc(D,Descs,Target,k,L)
local binarized:
 binarized:=PrepData(D,Descs,Target,k):
 Normalize(binarized,L):
end: