Contents

function [M,W,tau,alpha,rewardOverTime ] = GrouPS(bestSize,newSamples, sampleSize, nmbrGroups, DimPerGroup, latentDim, featureDim, Time, createSamplesFunc, evaluateFunc, initialM, maxNumberOfIterations, maxNumberOfInnerIterations, sigma2_M, antiConvergenceFactor, tauA, tauB, alphaA, alphaB, initialTau, initialAlpha)

GrouPS

The Group Factor Policy Search algorithm for the determination of the parameter matrix in policy search problems with fixed horizons.

Example of createSampleFunc

This section states a short example and gives further information for the parameter function createSampleFunc

[Actions, Basisfunctions, Reward, Zvals] = createSamplesFunc(W,M,tau)

The function is called with the current estimated parameters W, M and tau. The parameters W,M and tau are cell arrays of the size nmbrGroups $\times$ 1. For each W{n} has the size of DimPerGroup(n) $\times$ latentDim and consists of the corresponding sub-matrix of W. Each M{n} contains a submatrix of the size DimPerGroup(n) $\times$ featureDim . The cells of tau contains only a scalar representing the tau-value of the given group.

The return value Actions is a $\sum_n DimPerGroup(n) \times Time$ matrix with columns corresponding to the time steps containing the action vectors. Actions(:,1) is the action vector for the first time step, Actions(:,2) for the second and so on. The matrix Basisfunctions has the size featureDim $\times$ Time and each column consists of the feature vector for the corresponding time step. The array Reward has the size Time and contains a reward for each time step. It can consists of the same reward for all time steps in the case of a single reward, one may may use ones(1,n) * reward /n . The matrix Zvals has the size latentDim $\times$ Time . Each column corresponds to a random vector given by $N(\mathbf{0}, trace(\Phi \Phi^T) \mathbf{I} )$ .

evaluateFunc

The evaluateFun parameter is a function with the same parameters and output as createSampleFunc given above. This function is only called at the end of an iteration for the computation of the current reward. This reward will be stored in the output array rewardOverTime. The function evaluateFunc itself may or may ot use the parameters tau and W.

initialM

The parameter contains the initial M matrix. It is given by a cell array with a cell for each group. An example is given below.

initialM = cell(nmbrGroups,1);
for m=1:nmbrGroups
    initialM{m} = zeros(DimPerGroup(m),featureDim);
end
initialM{1}(1,:) = 30;
initialM{3}(1,:) = 150;

Check some parameters

if newSamples > sampleSize
    warning('More samples are generated than needed. Value of newSamples is to high. newSamples is now set to sampleSize!')
    newSamples = sampleSize;
end
if bestSize > sampleSize
    warning('There are not enough samples to keep. Value of bestSize is to high. Set bestSize to the value of sampleSize')
    bestSize = sampleSize;
end
if nmbrGroups ~= max(size(DimPerGroup))
    warning('The value of nmbrGroups does not match the actual numbers of groups. Value is set to the actual number')
    nmbrGroups = length(DimPerGroup);
end
if latentDim > sum(DimPerGroup)
    warning('Number of latent dimensions is actually higher than the dimensionality')
end
Not enough input arguments.

Error in GrouPS (line 99)
if newSamples > sampleSize

Initialization

numberIterations = 0;
numberInnerIterations=0;

rewardPlot = [];
notConverged = true;
notConverged2 = true;

W = cell(nmbrGroups,1);
M = cell(nmbrGroups,1);
tau = cell(nmbrGroups,1);
alpha = cell(nmbrGroups,1);
for m=1:nmbrGroups
    W{m} = randn(DimPerGroup(m),latentDim);
    tau{m} = initialTau;
    alpha{m} = initialAlpha;
end
M = initialM;

rewardPlot = evaluateFunc(W,M,tau);
while notConverged
    notConverged2 = true;
    % This is the part where I want to create trajectories
    sumOfRewards = 0;
    if newSamples == sampleSize || numberIterations == 0
            Realization = cell(sampleSize,1);
            Z_dist = cell(sampleSize,1);
        for r=1:sampleSize
            % maybe store also z for initial values?
            [Actions, Basisfunctions, Reward, Zvals] = createSamplesFunc(W,M,tau); %createSampleStepsTwoArms(W,M,tau);
            Realization{r} = struct('Actions',Actions,'Basis',Basisfunctions,'Reward',Reward);
            Z_dist{r} = cell(Time,1);
            for t=1:Time
                Z_dist{r}{t} = struct('Mean',Zvals(:,t),'Cov',eye(latentDim));
            end
        end
    else
        index = randperm(sampleSize);
        for r=index(1:newSamples)
            % maybe store also z for initial values?
            [Actions, Basisfunctions, Reward, Zvals] = createSamplesFunc(W,M,tau); %createSampleStepsTwoArms(W,M,tau);
            Realization{r} = struct('Actions',Actions,'Basis',Basisfunctions,'Reward',Reward);
            Z_dist{r} = cell(Time,1);
            for t=1:Time
                Z_dist{r}{t} = struct('Mean',Zvals(:,t),'Cov',eye(latentDim));
            end
        end
    end
    clear r

    %Choose the best ones by a simple sum, proportional to mean
    for r=1:sampleSize
        reward_tmp(r) = sum(Realization{r}.Reward);
    end
    [~, Idx] = sort(reward_tmp,'descend');
    relevantSamplesIdx = Idx(1:bestSize);
    sumOfRewards = 0;
    for r=relevantSamplesIdx
        sumOfRewards = sumOfRewards + sum(Realization{r}.Reward);
    end

    clear reward_tmp Idx

    W_dist = cell(nmbrGroups,1);
    M_dist = cell(nmbrGroups,1);
    tau_dist = cell(nmbrGroups,1);
    alpha_dist =cell(nmbrGroups,1);

    for m=1:nmbrGroups
        W_dist{m} = cell(DimPerGroup(m),1);
        M_dist{m} = cell(DimPerGroup(m),1);
        alpha_dist{m} = cell(latentDim,1);

        for j=1:DimPerGroup(m)
            W_dist{m}{j} = struct('Mean',W{m}(j,:),'Cov', eye(latentDim));
            M_dist{m}{j} = struct('Mean',M{m}(j,:),'Cov',eye(featureDim));
        end
        tau_dist{m} = struct('A', tauA + 0.5 * DimPerGroup(m) * Time, 'B', tauB + 0.5 * DimPerGroup(m) * Time);
        for k=1:latentDim
            alpha_dist{m}{k} = struct('A', alphaA + DimPerGroup(m)/2, 'B', alphaB + DimPerGroup(m)/2);
        end
    end
    clear Actions Basisfunctions Reward Zvals r t j m

    numberInnerIterations = 0;
    while notConverged2
        M_old_dist = M_dist;

Compute ditsribution of M

        %Compute the Variance
            %Static Part
            Sigma_Phi = zeros(featureDim);
            for r=relevantSamplesIdx
                Basis = Realization{r}.Basis;
                reward = Realization{r}.Reward;
                for t=1:Time
                    Phi = Basis(:,t);
                    Sigma_Phi = Sigma_Phi + (Phi*Phi')/(Phi'*Phi) .* reward(t);
                end
            end
            %Normalization
            Sigma_Phi = Sigma_Phi/sumOfRewards;
            %Dynamic Part
            for m=1:nmbrGroups
                tau_Ew=(tau_dist{m}.A / tau_dist{m}.B);
                Sigma = inv((Sigma_Phi * tau_Ew) ...
                                        + eye(featureDim) /sigma2_M);
                for j=1:DimPerGroup(m)
                    M_dist{m}{j}.Cov = Sigma;
                end
            end
            clear Sigma Sigma_Phi r t m j Basis reward Phi tau_Ew;

        %Compute Mean of distribution of M
        for m=1:nmbrGroups
            startDim = sum(DimPerGroup(1:(m-1)));
            for j=1:DimPerGroup(m)
                mean = zeros(featureDim,1);
                for r=relevantSamplesIdx
                    for t=1:Time
                        Phi = Realization{r}.Basis(:,t);
                        a = Realization{r}.Actions(startDim+j,t);
                        reward = Realization{r}.Reward;
                        tau_m_Erw = tau_dist{m}.A / tau_dist{m}.B;
                        mean = mean + Phi * ( a - W_dist{m}{j}.Mean * Z_dist{r}{t}.Mean)...
                            * (tau_m_Erw) ...
                            / (Phi' * Phi) * reward(t);
                    end
                end
                mean = mean/sumOfRewards;
                mean = mean + M{m}(j,:)'/sigma2_M;
                mean = M_dist{m}{j}.Cov * mean;
                M_dist{m}{j}.Mean = mean';

            end
        end
        clear mean m j r t a Phi reward startDim tau_m_Erw

Compute Distribution of W

        %Compute the Variance of W
        %Static part
        Sigma = zeros(latentDim);
        for r=relevantSamplesIdx
            reward = Realization{r}.Reward;
            Basis = Realization{r}.Basis;
            for t=1:Time
                Sigma = Sigma + reward(t) * (Z_dist{r}{t}.Mean * Z_dist{r}{t}.Mean' + Z_dist{r}{t}.Cov) ...
                    /(Basis(:,t)' * Basis(:,t));
            end
        end
        Sigma = Sigma ./ sumOfRewards;
        clear r t reward Basis;

        for m=1:nmbrGroups
            alpha_mk = zeros(latentDim,1);
            for k=1:latentDim
                alpha_mk(k) = alpha_dist{m}{k}.A/alpha_dist{m}{k}.B;
            end
            alpha_mk = diag(alpha_mk);
            Sigma_tmp = inv(Sigma *(tau_dist{m}.A / tau_dist{m}.B) + alpha_mk);
            for j=1:DimPerGroup(m)
                W_dist{m}{j}.Cov = Sigma_tmp;
            end
        end
        clear m alpha_mk Sigma Sigma_tmp j k
        %Compute W Mean

        for m=1:nmbrGroups
            startDim = sum(DimPerGroup(1:(m-1)));
            for j=1:DimPerGroup(m)
                mean = zeros(latentDim,1);
                m_mj = M_dist{m}{j}.Mean;
                for r=relevantSamplesIdx
                    reward = Realization{r}.Reward;
                    action = Realization{r}.Actions;
                    Basis = Realization{r}.Basis;
                    tau_m_Erw = tau_dist{m}.A/ tau_dist{m}.B;
                    for t=1:Time
                        mean = mean + (reward(t) * ...
                            (action(startDim+j,t) - m_mj * Basis(:,t)) * Z_dist{r}{t}.Mean' ...
                            * (tau_m_Erw)...
                            /(Basis(:,t)' * Basis(:,t)))';
                    end
                end
                mean = mean ./ sumOfRewards;
                mean = W_dist{m}{j}.Cov * mean;
                W_dist{m}{j}.Mean = mean';
            end
        end
        clear m j r t mean Basis action reward m_mj tau_m_Erw

Compute Distribution of Z

        %Compute the static part
        W_static = zeros(latentDim,latentDim);
        W_static_mean = cell(nmbrGroups,1);
        M_static_mean = cell(nmbrGroups,1);

        for m=1:nmbrGroups
            W_m_mean = zeros(DimPerGroup(m),latentDim);
            M_m_mean = zeros(DimPerGroup(m),featureDim);
            tau_tmp = tau_dist{m}.A/tau_dist{m}.B;
            for j=1:DimPerGroup(m)
                W_m_mean(j,:) = W_dist{m}{j}.Mean;
                M_m_mean(j,:) = M_dist{m}{j}.Mean;
            end
            W_static_mean{m} = W_m_mean;
            M_static_mean{m} = M_m_mean;
            W_static = W_static + (DimPerGroup(m) *  W_dist{m}{1}.Cov + W_m_mean'*W_m_mean) * tau_tmp;
        end
        clear W_m_mean m j M_m_mean tau_tmp

        for r=relevantSamplesIdx
            Basis = Realization{r}.Basis;
            Action = Realization{r}.Actions;
            for t=1:Time
                Z_dist{r}{t}.Cov = inv(Basis(:,t)' * Basis(:,t) \ (eye(latentDim) + W_static));
                mean = zeros(latentDim,1);
                for m=1:nmbrGroups
                    mean = mean + (W_static_mean{m}' * (Action((1+sum(DimPerGroup(1:(m-1)))):(sum(DimPerGroup(1:m))) ,t) - M_static_mean{m} * Basis(:,t)))...
                        /(Basis(:,t)' * Basis(:,t)) * (tau_dist{m}.A/tau_dist{m}.B);
                end
                Z_dist{r}{t}.Mean = Z_dist{r}{t}.Cov * mean;
            end
        end
        clear Basis r t W_static_mean M_static_mean W_static Action mean

Distribution of Tau

        %The A parmater of the Tau distribution remains the same, so we
        %have just to compute the B parameter, which is still quite a lot
        %of work
        for m=1:nmbrGroups
            M_tmp = zeros(DimPerGroup(m),featureDim);
            W_tmp = zeros(DimPerGroup(m),latentDim);

            for j=1:DimPerGroup(m)
                M_tmp(j,:) = M_dist{m}{j}.Mean;
                W_tmp(j,:) = W_dist{m}{j}.Mean;
            end


            Bm = 0;
            for r=relevantSamplesIdx
                reward = Realization{r}.Reward;
                basis = Realization{r}.Basis;
                actions = Realization{r}.Actions;
                dims_start = sum(DimPerGroup(1:(m-1)))+1;

                for t=1:Time
                    actions_t = actions(dims_start:(dims_start + DimPerGroup(m) -1),t);
                    Exp_Wt_W = DimPerGroup(m) * W_dist{m}{1}.Cov + W_tmp'*W_tmp;
                    Bm = Bm + reward(t) / (basis(:,t)' * basis(:,t)) * ...
                        (...
                            actions_t' * actions_t + ...
                            -2 * actions_t' * M_tmp * basis(:,t) + ...
                            2 * Z_dist{r}{t}.Mean' * W_tmp' * M_tmp * basis(:,t)+...
                            -2 *actions_t' * W_tmp * Z_dist{r}{t}.Mean + ...
                            basis(:,t)' * (DimPerGroup(m) * M_dist{m}{1}.Cov + M_tmp' * M_tmp) * basis(:,t) + ...
                            trace(Exp_Wt_W * Z_dist{r}{t}.Cov) + ...
                            Z_dist{r}{t}.Mean' * Exp_Wt_W * Z_dist{r}{t}.Mean...
                        );
                end
            end
            Bm = Bm/sumOfRewards;
            Bm = Bm/2;
            tau_dist{m}.B = tauB + Bm;
        end
        clear Bm basis reward M_tmp W_tmp dims_start actions_t Exp_Wt_W actions

Compute alpha Distribution

        % Assume special case for alpha with gamma distribution

        % Only the B parameter changes
        for m=1:nmbrGroups
            W_tmp = zeros(DimPerGroup(m),latentDim);
            sum_variances = zeros(1,latentDim);
            for j=1:DimPerGroup(m)
                W_tmp(j,:) = W_dist{m}{j}.Mean;
                for k=1:latentDim
                    sum_variances(k) = sum_variances(k) + W_dist{m}{j}.Cov(k,k);
                end
            end
            for k=1:latentDim
                alpha_dist{m}{k}.B = alphaB + W_tmp(:,k)'*W_tmp(:,k)/2 ...;
                    +sum_variances(k)/2 ...
                    ;
            end
        end
        clear W_tmp startDim

Check for convergence

        valueNorm_M_mean = 0;
        for m=1:nmbrGroups
            for j=1:DimPerGroup(m)
                valueNorm_M_mean = valueNorm_M_mean + norm(M_old_dist{m}{j}.Mean - M_dist{m}{j}.Mean,2);
            end
        end
        notConverged2 = valueNorm_M_mean > 0.01;
        fprintf('%i \t %f\n', numberInnerIterations, valueNorm_M_mean);
        if numberInnerIterations >= maxNumberOfInnerIterations
            notConverged2 = 0;
        else
            numberInnerIterations = numberInnerIterations + 1;
        end
    end

    for m=1:nmbrGroups
        for j=1:DimPerGroup(m)
            M{m}(j,:) =  M_dist{m}{j}.Mean;
            W{m}(j,:) = antiConvergenceFactor * W_dist{m}{j}.Mean;
        end
        tau{m} = antiConvergenceFactor\ (tau_dist{m}.A/tau_dist{m}.B);
        for k=1:latentDim
            alpha{m}(k) = alpha_dist{m}{k}.A/alpha_dist{m}{k}.B;
        end
    end

Very simple convergence validation

    numberIterations = numberIterations + 1;
    notConverged = numberIterations < maxNumberOfIterations;

Evaluate the current solution

    [~,~,rewTmp,~] = evaluateFunc(W,M,tau);
    rewardPlot(numberIterations+1) = rewTmp;
    clear rewTmp;

Print the current iteraton

    fprintf('\t Iteration %i\n',numberIterations);
end

rewardOverTime = rewardPlot;
end