%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%标签:lfts,end,%%,Qlearning,控制算法,0.5,trial,matlab,exp From: https://blog.51cto.com/u_15815923/5744748
clc;clear all
close all
% flops(0);
%hold off
%%=====================================================
%% Initialize parameters
Ang2Rad=pi/180;
Rad2Ang=180/pi;
alpha=0.9; %discount factor
Initlc=0.3; %initial learning rate for critic
Initla=0.3; %initial learning rate for action
Ta = 0.005;
Tc = 0.05;
FailDeg = 12.0;
FailTheta = (FailDeg*Ang2Rad);
Boundary = 2.4;
Mag = 10; % Control force magnitude
WA_Inputs = 4;
WC_Inputs = 5;
INIT_THETA = ((rand-0.5)*24*Ang2Rad); % (rad) was 2.0
INIT_THETADOT = (0.0*Ang2Rad); % (rad/s) was 2.0
INIT_X = 0.0; % (m) was 0.25
INIT_XDOT = 0.0; % (m/s) was 0.25
inistate = [INIT_THETA INIT_THETADOT INIT_X INIT_XDOT];
NF_Theta = FailTheta;
NF_ThetaDot = (120.0*Ang2Rad);
NF_x = Boundary;
NF_xDot = 1.5;
NF = [NF_Theta NF_ThetaDot NF_x NF_xDot];
Ncrit = 50;
Nact = 100;
tstep=0.02;
% Maximum time assumed the pole will balanced forever,the unit is seconds
MaxTime = 120; %(s) was 120 seconds
Tit=MaxTime/tstep;
%Desired number of trials
MaxTr=1000;
%Number of nodes in hidden layer
N_Hidden = 6;
%Objective value of the cost function
Uc=0;
%初始化
lfts_history = 6;
max_stuck_count = 5;
max_lfts_diff = 5;
trial_state=zeros(lfts_history,2);
%初始化网络权值
wc1=(rand(WC_Inputs,N_Hidden)-0.5)*2;
wc2=(rand(N_Hidden,1)-0.5)*2;
wa1=(rand(WA_Inputs,N_Hidden)-0.5)*2;
wa2=(rand(N_Hidden,1)-0.5)*2;
%开始训练
for trial=1:MaxTr, %外部循环开始
count=0;
failure=0;
failReason=0;
lfts = 1;
newSt = inistate;
inputs = newSt./NF;
lc = Initlc;
la = Initla;
xhist=newSt;
%计算newAction
ha = inputs*wa1;
g = (1 - exp(-ha))./(1 + exp(-ha));
va = g*wa2;
newAction = (1 - exp(-va))./(1 + exp(-va));
%计算J
inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;
Jprev = J;
while(lfts<Tit), %内部循环开始
if (rem(lfts,500)==0),
disp(['It is ' int2str(lfts) ' time steps now......']);
end
%生成控制信号
if (newAction >= 0)
sgnf = 1;
else
sgnf = -1;
end
u = Mag*sgnf; %bang-bang control
%Plug in the model
[T,Xf]=ode45('cartpole_model',[0 tstep],newSt,[],u);
a=size(Xf);
newSt=Xf(a(1),:);
inputs=newSt./NF; %input normalization
%计算newAction
ha = inputs*wa1;
g = (1 - exp(-ha))./(1 + exp(-ha));
va = g*wa2;
newAction = (1 - exp(-va))./(1 + exp(-va));
%calculate new J
inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;
xhist=[xhist;newSt];
%%===========================================================%%
%%求取强化信号r(t),即reinf %%
%%===========================================================%%
if (abs(newSt(1)) > FailTheta)
reinf = 1;
failure = 1;
failReason = 1;
elseif (abs(newSt(3)) > Boundary)
reinf = 1;
failure = 1;
failReason = 2;
else
reinf = 0;
end
%%================================%%
%% learning rate update scheme %%
%%================================%%
if (rem(lfts,5)==0)
lc = lc - 0.05;
la = la - 0.05;
end
if (lc<0.01)
lc=0.005;
end
if (la<0.01)
la=0.005;
end
%%================================================%%
%% internal weights updating cycles for critnet %%
%%================================================%%
cyc = 0;
ecrit = alpha*J-(Jprev-reinf);
Ec = 0.5 * ecrit^2;
while (Ec>Tc & cyc<=Ncrit),
gradEcJ=alpha*ecrit;
%----for the first layer(input to hidden layer)-----------
gradqwc1 = [inputs'; newAction];
for i=1:N_Hidden,
gradJp = wc2(i);
gradpq = 0.5*(1-p(i)^2);
wc1(:,i) = wc1(:,i) - lc*gradEcJ*gradJp*gradpq*gradqwc1;
end
%----for the second layer(hidden layer to output)-----------
gradJwc2=p';
wc2 = wc2- lc*gradEcJ*gradJwc2;
%----compute new J----
inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;
cyc = cyc +1;
ecrit = alpha*J-(Jprev-reinf);
Ec = 0.5 * ecrit^2;
end % end of "while (Ec>0.05 & cyc<=Ncrit)"
%normalization weights for critical network
if (max(max(abs(wc1)))>1.5)
wc1=wc1/max(max(abs(wc1)));
end
if max(max(abs(wc2)))>1.5
wc2=wc2/max(max(abs(wc2)));
end
%%=============================================%%
%% internal weights updating cycles for actnet %%
%%=============================================%%
cyc = 0;
eact = J - Uc;
Ea = 0.5*eact^2;
while (Ea>Ta & cyc<=Nact),
graduv = 0.5*(1-newAction^2);
gradEaJ = eact;
gradJu = 0;
for i=1:N_Hidden,
gradJu = gradJu + wc2(i)*0.5*(1-p(i)^2)*wc1(WC_Inputs,i);
end
%----for the first layer(input to hidden layer)-----------
for (i=1:N_Hidden),
gradvg = wa2(i);
gradgh = 0.5*(1-g(i)^2);
gradhwa1 = inputs';
wa1(:,i)=wa1(:,i)-la*gradEaJ*gradJu*graduv*gradvg*gradgh*gradhwa1;
end
%----for the second layer(hidden layer to output)-----------
gradvwa2 = g';
wa2=wa2-la*gradEaJ*gradJu*graduv*gradvwa2;
%----compute new J and newAction-------
ha = inputs*wa1;
g = (1 - exp(-ha))./(1 + exp(-ha));
va = g*wa2;
newAction = (1 - exp(-va))./(1 + exp(-va));
inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;
cyc = cyc+1;
eact = J - Uc;
Ea = 0.5*eact^2;
end %end of "while (Ea>Ta & cyc<=Nact)"
if ~failure
Jprev=J;
else
break; %another trial 即跳出“while(lfts<Tit),”
end
lfts=lfts+1;
end %end of "while(lfts<Tit)" 结束内部循环
msgstr1=['Trial # ' int2str(trial) ' has ' int2str(lfts) ' time steps.'];
msgstr21=['Trial # ' int2str(trial) ' has successfully balanced for at least '];
msgstr22=[msgstr21 int2str(lfts) ' time steps '];
if ~failure %如果成功
disp(msgstr22);
for i=1:1000,
Ang(i,1)=xhist(i,1)*Rad2Ang;
Dis(i,1)=xhist(i,3);
end
subplot(2,1,1);
plot(Ang(:,1));
title('Angle(Degree)');
grid on;
subplot(2,1,2);
plot(Dis(:,1));
title('Distance(Meter)');
grid on;
break;
else %如果失败
disp(msgstr1);
end %end of “if ~failure”
if (rem(trial,100)==0 & trial~=MaxTr)
msgstr3=['Press any key to continue...'];
disp(msgstr3);
end
%======================================================================
% weights in and compare to previously stored old values
%======================================================================
if trial~=MaxTr
for s=1:lfts_history-1
trial_state(s,:) = trial_state(s+1,:);
end
trial_state(lfts_history,:) = [lfts,(~failure)];
for i=1:lfts_history-1
if (abs(trial_state(i,1)-trial_state(i+1,1))<=max_lfts_diff)
count=count+1;
end
end
if count==max_stuck_count
%%===========================================================%%
%% initialize weights for crit network and action network. %%
%%===========================================================%%
wc1=(rand(WC_Inputs,N_Hidden)-0.5)*2;
wc2=(rand(N_Hidden,1)-0.5)*2;
wa1=(rand(WA_Inputs,N_Hidden)-0.5)*2;
wa2=(rand(N_Hidden,1)-0.5)*2;
end
end % end of “if trial~=MaxTr”
end % end of for trials 结束外部循环
fprintf('\nRun successful at trial #%d\n\n',trial);
D-027