function [J,G] = objFuncLR(w,X,y,alpha,lambda,we)
% Logistic regression objective function and gradient
%
% Inputs:
%
% w - vector of model parameters. In the two-category case, this is a
% (d+1)-dimensional vector, where d is the number of variables in the
% data. In the multicategory case, this is an M*(d+1)-dimensional vector,
% where M is the number of categories.
%
% X - n*d data matrix with n observations and d variables
%
% y - binary-coded category labels. In the two-category case, this is an
% n-dimensional vector of zeros and ones. In the multicategory case, this
% is an n*M matrix of zeros and ones (use "codeCategories.m" to convert
% labels to this format).
%
% alpha - controls the balance between L1- and L2-regularization:
% alpha = 1 means pure L1-regularization
% alpha = 0 means pure L2-regularization
% 0 < alpha < 1 means a combination of L1- and L2-regularization
%
% lambda - controls the extent of regularization with respect to the
% log-likelihood. lambda = 0 means that regularization is not used at all;
% increasing lambda increases the extent of regularization.
%
% we - optional parameter for weighting the model parameters. Do not use
% in the current version.
%
% Outputs:
%
% J - value of the objective function
%
% G - gradient of the objective function
%
% Jukka-Pekka Kauppi
% University of Helsinki, Department of Computer Science
%
% History:
% Multinomial logreg implemented 22.8.2012
% Binomial logreg implemented and comments added 23.8.2012

if nargin == 3
    alpha = 0;
    lambda = 1;
end

n = size(X,1);
d = size(X,2);
M = size(y,2);

% in the two-category case, make sure that labels are given as a column vector:
if min(size(y)) == 1
    y = y(:);
    M = 2;
end

% add a column of ones so that the bias term is included in the scalar products:
X = [X ones(size(X,1),1)];

if M > 2 % multicategory case

    % order weights category-wise for easier handling:
    w = reshape(w,(d+1),M);
    % w = w.*we; % optional: "we" can be used to weight the parameters

    % linear responses and softmax probabilities:
    q_tm = zeros(n,M); % n*M
    for m = 1:M
        q_tm(:,m) = X*w(:,m);
    end
    R = sum(exp(q_tm),2);           % n*1
    Ptm = exp(q_tm)./repmat(R,1,M); % n*M

    % COMPUTE GRADIENT:

    % gradient of the log-likelihood:
    dw = zeros(d+1,M);
    for m = 1:M
        dw(:,m) = sum(X.*(repmat(y(:,m),1,d+1) - repmat(Ptm(:,m),1,d+1)),1)';
    end
    % gradient of the penalty:
    der_penalty = lambda*( alpha*sign(w(:)) + (1-alpha)*w(:) );
    % dw = dw.*we;
    % complete gradient:
    G = dw(:) - der_penalty;
    % flip signs because the sign of the objective function is flipped (see below):
    G = -1*G;

    % COMPUTE OBJECTIVE FUNCTION:

    % log-likelihood:
    term1 = sum(y.*q_tm,2);
    term2 = -log(R);
    J = sum( term1 + term2 );
    % penalty term (prior):
    penalty = lambda*( alpha*sum( abs(w(:)) ) + 0.5*(1-alpha)*sum( w(:).^2 ) );
    % complete objective function:
    J = J - penalty;
    % convert the maximization problem to a minimization problem:
    J = -1*J;
    % disp(num2str(J)) % optional: display the objective function value

else % two-category case

    % linear responses and logistic probabilities:
    q_t = X*w;
    exp_q_t = exp(q_t);
    R = 1 + exp_q_t;
    P_t = exp_q_t ./ R;

    % COMPUTE OBJECTIVE FUNCTION:

    % log-likelihood:
    J = sum( (y.*q_t) - log(R) );
    % penalty term (prior):
    penalty = lambda*( alpha*sum( abs(w) ) + 0.5*(1-alpha)*sum( w.^2 ) );
    % complete objective function:
    J = J - penalty;
    % convert the maximization problem to a minimization problem:
    J = -1*J;

    % COMPUTE GRADIENT:

    % gradient of the log-likelihood:
    dw = sum(X.*(repmat(y,1,d+1) - repmat(P_t,1,d+1)), 1)';
    % gradient of the penalty:
    der_penalty = lambda*( alpha*sign(w) + (1-alpha)*w );
    % dw = dw.*we;
    % complete gradient:
    G = dw - der_penalty;
    % flip signs because the sign of the objective function is flipped:
    G = -1*G;

end
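
% ----------------------------------------------------------------------
% Example usage (a minimal sketch, not part of the original code). The
% variable names Xtrain, labels and w0 are placeholders, the single-output
% call to codeCategories.m is assumed, and the call to fminunc
% (Optimization Toolbox) is an assumption about the surrounding pipeline.
% Note that the objective is smooth only for alpha = 0 (pure
% L2-regularization); for alpha > 0 the L1 term is non-differentiable and
% a solver designed for L1 problems is needed.
%
%   ybin   = codeCategories(labels);          % n*M 0/1 label matrix (multicategory)
%   d      = size(Xtrain,2);
%   M      = size(ybin,2);
%   w0     = zeros(M*(d+1),1);                % initial parameter vector
%   alpha  = 0;                               % pure L2-regularization
%   lambda = 0.1;                             % regularization strength
%   opts   = optimset('GradObj','on','Display','iter');
%   wHat   = fminunc(@(w) objFuncLR(w,Xtrain,ybin,alpha,lambda), w0, opts);
%   wHat   = reshape(wHat,d+1,M);             % one weight column per category
% ----------------------------------------------------------------------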