首页 > 编程语言 >基于Qlearning的倒立摆控制算法matlab程序

基于Qlearning的倒立摆控制算法matlab程序

时间:2022-10-10 16:31:31浏览次数:63  
标签:lfts end %% Qlearning 控制算法 0.5 trial matlab exp

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
clc;clear all
close all
% flops(0);
%hold off

%%=====================================================
%% Initialize parameters
Ang2Rad=pi/180;
Rad2Ang=180/pi;

alpha=0.9; %discount factor
Initlc=0.3; %initial learning rate for critic
Initla=0.3; %initial learning rate for action
Ta = 0.005;
Tc = 0.05;

FailDeg = 12.0;
FailTheta = (FailDeg*Ang2Rad);
Boundary = 2.4;

Mag = 10; % Control force magnitude

WA_Inputs = 4;
WC_Inputs = 5;

INIT_THETA = ((rand-0.5)*24*Ang2Rad); % (rad) was 2.0
INIT_THETADOT = (0.0*Ang2Rad); % (rad/s) was 2.0
INIT_X = 0.0; % (m) was 0.25
INIT_XDOT = 0.0; % (m/s) was 0.25
inistate = [INIT_THETA INIT_THETADOT INIT_X INIT_XDOT];

NF_Theta = FailTheta;
NF_ThetaDot = (120.0*Ang2Rad);
NF_x = Boundary;
NF_xDot = 1.5;
NF = [NF_Theta NF_ThetaDot NF_x NF_xDot];

Ncrit = 50;
Nact = 100;
tstep=0.02;

% Maximum time assumed the pole will balanced forever,the unit is seconds
MaxTime = 120; %(s) was 120 seconds
Tit=MaxTime/tstep;

%Desired number of trials
MaxTr=1000;

%Number of nodes in hidden layer
N_Hidden = 6;

%Objective value of the cost function
Uc=0;

%初始化
lfts_history = 6;
max_stuck_count = 5;
max_lfts_diff = 5;
trial_state=zeros(lfts_history,2);

%初始化网络权值
wc1=(rand(WC_Inputs,N_Hidden)-0.5)*2;
wc2=(rand(N_Hidden,1)-0.5)*2;

wa1=(rand(WA_Inputs,N_Hidden)-0.5)*2;
wa2=(rand(N_Hidden,1)-0.5)*2;
%开始训练
for trial=1:MaxTr, %外部循环开始
count=0;
failure=0;
failReason=0;
lfts = 1;
newSt = inistate;
inputs = newSt./NF;
lc = Initlc;
la = Initla;

xhist=newSt;

%计算newAction
ha = inputs*wa1;
g = (1 - exp(-ha))./(1 + exp(-ha));
va = g*wa2;
newAction = (1 - exp(-va))./(1 + exp(-va));
%计算J
inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;

Jprev = J;

while(lfts<Tit), %内部循环开始

if (rem(lfts,500)==0),
disp(['It is ' int2str(lfts) ' time steps now......']);
end

%生成控制信号
if (newAction >= 0)
sgnf = 1;
else
sgnf = -1;
end
u = Mag*sgnf; %bang-bang control

%Plug in the model
[T,Xf]=ode45('cartpole_model',[0 tstep],newSt,[],u);
a=size(Xf);
newSt=Xf(a(1),:);
inputs=newSt./NF; %input normalization

%计算newAction
ha = inputs*wa1;
g = (1 - exp(-ha))./(1 + exp(-ha));
va = g*wa2;
newAction = (1 - exp(-va))./(1 + exp(-va));
%calculate new J
inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;

xhist=[xhist;newSt];

%%===========================================================%%
%%求取强化信号r(t),即reinf %%
%%===========================================================%%

if (abs(newSt(1)) > FailTheta)
reinf = 1;
failure = 1;
failReason = 1;
elseif (abs(newSt(3)) > Boundary)
reinf = 1;
failure = 1;
failReason = 2;
else
reinf = 0;
end

%%================================%%
%% learning rate update scheme %%
%%================================%%

if (rem(lfts,5)==0)
lc = lc - 0.05;
la = la - 0.05;
end

if (lc<0.01)
lc=0.005;
end

if (la<0.01)
la=0.005;
end

%%================================================%%
%% internal weights updating cycles for critnet %%
%%================================================%%

cyc = 0;
ecrit = alpha*J-(Jprev-reinf);
Ec = 0.5 * ecrit^2;
while (Ec>Tc & cyc<=Ncrit),
gradEcJ=alpha*ecrit;
%----for the first layer(input to hidden layer)-----------
gradqwc1 = [inputs'; newAction];
for i=1:N_Hidden,
gradJp = wc2(i);
gradpq = 0.5*(1-p(i)^2);
wc1(:,i) = wc1(:,i) - lc*gradEcJ*gradJp*gradpq*gradqwc1;
end
%----for the second layer(hidden layer to output)-----------
gradJwc2=p';
wc2 = wc2- lc*gradEcJ*gradJwc2;
%----compute new J----
inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;

cyc = cyc +1;
ecrit = alpha*J-(Jprev-reinf);
Ec = 0.5 * ecrit^2;
end % end of "while (Ec>0.05 & cyc<=Ncrit)"

%normalization weights for critical network
if (max(max(abs(wc1)))>1.5)
wc1=wc1/max(max(abs(wc1)));
end
if max(max(abs(wc2)))>1.5
wc2=wc2/max(max(abs(wc2)));
end

%%=============================================%%
%% internal weights updating cycles for actnet %%
%%=============================================%%
cyc = 0;
eact = J - Uc;
Ea = 0.5*eact^2;
while (Ea>Ta & cyc<=Nact),
graduv = 0.5*(1-newAction^2);
gradEaJ = eact;
gradJu = 0;
for i=1:N_Hidden,
gradJu = gradJu + wc2(i)*0.5*(1-p(i)^2)*wc1(WC_Inputs,i);
end
%----for the first layer(input to hidden layer)-----------
for (i=1:N_Hidden),
gradvg = wa2(i);
gradgh = 0.5*(1-g(i)^2);
gradhwa1 = inputs';
wa1(:,i)=wa1(:,i)-la*gradEaJ*gradJu*graduv*gradvg*gradgh*gradhwa1;
end
%----for the second layer(hidden layer to output)-----------
gradvwa2 = g';
wa2=wa2-la*gradEaJ*gradJu*graduv*gradvwa2;
%----compute new J and newAction-------
ha = inputs*wa1;
g = (1 - exp(-ha))./(1 + exp(-ha));
va = g*wa2;
newAction = (1 - exp(-va))./(1 + exp(-va));

inp=[inputs newAction];
qc=inp*wc1;
p = (1 - exp(-qc))./(1 + exp(-qc));
J=p*wc2;

cyc = cyc+1;
eact = J - Uc;
Ea = 0.5*eact^2;
end %end of "while (Ea>Ta & cyc<=Nact)"

if ~failure
Jprev=J;
else
break; %another trial 即跳出“while(lfts<Tit),”
end
lfts=lfts+1;
end %end of "while(lfts<Tit)" 结束内部循环

msgstr1=['Trial # ' int2str(trial) ' has ' int2str(lfts) ' time steps.'];
msgstr21=['Trial # ' int2str(trial) ' has successfully balanced for at least '];
msgstr22=[msgstr21 int2str(lfts) ' time steps '];

if ~failure %如果成功
disp(msgstr22);

for i=1:1000,
Ang(i,1)=xhist(i,1)*Rad2Ang;
Dis(i,1)=xhist(i,3);
end
subplot(2,1,1);
plot(Ang(:,1));
title('Angle(Degree)');
grid on;
subplot(2,1,2);
plot(Dis(:,1));
title('Distance(Meter)');
grid on;

break;
else %如果失败
disp(msgstr1);
end %end of “if ~failure”

if (rem(trial,100)==0 & trial~=MaxTr)
msgstr3=['Press any key to continue...'];
disp(msgstr3);
end

%======================================================================
% weights in and compare to previously stored old values
%======================================================================
if trial~=MaxTr
for s=1:lfts_history-1
trial_state(s,:) = trial_state(s+1,:);
end
trial_state(lfts_history,:) = [lfts,(~failure)];

for i=1:lfts_history-1
if (abs(trial_state(i,1)-trial_state(i+1,1))<=max_lfts_diff)
count=count+1;
end
end

if count==max_stuck_count
%%===========================================================%%
%% initialize weights for crit network and action network. %%
%%===========================================================%%

wc1=(rand(WC_Inputs,N_Hidden)-0.5)*2;
wc2=(rand(N_Hidden,1)-0.5)*2;

wa1=(rand(WA_Inputs,N_Hidden)-0.5)*2;
wa2=(rand(N_Hidden,1)-0.5)*2;
end
end % end of “if trial~=MaxTr”
end % end of for trials 结束外部循环

fprintf('\nRun successful at trial #%d\n\n',trial);
D-027

标签:lfts,end,%%,Qlearning,控制算法,0.5,trial,matlab,exp
From: https://blog.51cto.com/u_15815923/5744748

相关文章