How do I train a model for an environment with variable features using openai gym? - machine-learning

I have a grid network (like a maze) and I want to find the shortest route from the source node to the destination node.
I've coded that and it works well. It finds the shortest route for a network of dimensions x*y, with the source and destination nodes given as inputs to the environment: env = netEnv(3, 4, 1, 8) means my env is a 3*4 grid network with node 1 as the source node and node 8 as the destination node.
The question is: how do I train the model for any destination? The model works well if the destination is, say, node 8, but I want it trained so that it finds the route even when the destination is a different node in the environment. I don't want the destination to always be fixed.
How can I do that?
Here is my code:
#import dependencies
import gym
from gym import Env
from gym.spaces import Discrete
from stable_baselines3 import DQN
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os
#building the environment
class netEnv(Env):
    def __init__(self, x, y, sourceNode, destinationNode):
        self.action_space = Discrete(4) #0, 1, 2, 3 => up, down, right, left
        self.observation_space = Discrete(x*y)
        self.sourceNode = sourceNode
        self.destinationNode = destinationNode
        self.x = x
        self.y = y
        self.state = self.sourceNode
        self.episodeLength = 20

    def step(self, action):
        #set actions and rewards
        if action == 0: #up
            if self.state >= self.y:
                self.state = self.state - self.y
                #print(self.state)
                reward = 0.1
            else:
                reward = 0
        elif action == 1: #down
            if self.state < (self.x-1)*self.y:
                self.state = self.state + self.y
                #print(self.state)
                reward = 0.1
            else:
                reward = 0
        elif action == 2: #right
            if (self.state % self.y) != self.y-1:
                self.state += 1
                #print(self.state)
                reward = 0.1
            else:
                reward = 0
        else: #left
            if (self.state % self.y) != 0:
                self.state -= 1
                #print(self.state)
                reward = 0.1
            else:
                reward = 0
        #decrease the remaining time
        self.episodeLength -= 1
        #reward in termination state
        if self.state == self.destinationNode:
            reward += self.episodeLength
        #set done
        if self.episodeLength <= 0 or self.state == self.destinationNode:
            done = True
        else:
            done = False
        #set info
        info = {}
        return self.state, reward, done, info

    def render(self):
        pass

    def reset(self):
        self.state = self.sourceNode
        self.episodeLength = 20
        return self.state
env = netEnv(3, 4, 1, 8)
#training
log_path = os.path.join('Training', 'Logs')
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log = log_path)
model.learn(total_timesteps=15000)
net_path = os.path.join('Training', 'Saved_models', 'net_PPO_15000')
model.save(net_path)
#delete and reload the model
del model
model = PPO.load(net_path, env)
#evaluate the model
print(evaluate_policy(model, env, n_eval_episodes = 10, render = False))
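One common way to handle a variable destination is to make the goal part of the observation and sample a new destination on every reset, so the policy is trained over many (state, destination) pairs instead of a single fixed goal. Below is a minimal sketch of that idea built on the environment above; the goalNetEnv subclass, the MultiDiscrete observation and the random destination in reset are illustrative assumptions, not part of the original code.
#sketch of a goal-conditioned variant: the agent observes (current node, destination node)
import random
import numpy as np
from gym.spaces import MultiDiscrete

class goalNetEnv(netEnv):
    def __init__(self, x, y, sourceNode):
        super().__init__(x, y, sourceNode, destinationNode=None)
        #the observation now carries the destination as well, so the policy can generalize over goals
        self.observation_space = MultiDiscrete([x*y, x*y])

    def reset(self):
        self.state = self.sourceNode
        #sample a new destination every episode (any node except the source)
        self.destinationNode = random.choice([n for n in range(self.x*self.y) if n != self.sourceNode])
        self.episodeLength = 20
        return np.array([self.state, self.destinationNode])

    def step(self, action):
        _, reward, done, info = super().step(action)
        return np.array([self.state, self.destinationNode]), reward, done, info

env = goalNetEnv(3, 4, 1)
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=15000)
With this setup the trained policy receives the destination at every step, so at evaluation time you can ask it for routes to any node by fixing destinationNode instead of sampling it.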

Related

How can I improve this Reinforcement Learning scenario in Stable Baselines3?

In this scenario, I present a box observation with numbers 0, 1 or 2 and shape (1, 10).
The odds for 0 and 2 are 2% each, and 96% for 1.
I want the model to learn to pick the index of any 2 that comes. If it doesn't have a 2, just choose 0.
Below is my code:
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
action_length = 10
class TestBot(gym.Env):
    def __init__(self):
        super(TestBot, self).__init__()
        self.total_rewards = 0
        self.time = 0
        self.action_space = spaces.Discrete(action_length)
        self.observation_space = spaces.Box(low=0, high=2, shape=(1, action_length), dtype=np.float32)

    def generate_next_obs(self):
        p = [0.02, 0.02, 0.96]
        a = [0, 2, 1]
        self.observation = np.random.choice(a, size=(1, action_length), p=p)
        if 2 in self.observation[0][1:]:
            self.best_reward += 1

    def reset(self):
        if self.time != 0:
            print('Total rewards: ', self.total_rewards, 'Best possible rewards: ', self.best_reward)
        self.best_reward = 0
        self.time = 0
        self.generate_next_obs()
        self.total_rewards = 0
        self.last_observation = self.observation
        return self.observation

    def step(self, action):
        reward = 0
        if action != 0:
            last_value = self.last_observation[0][action]
            if last_value == 2:
                reward = 1
            else:
                reward = -1
        self.time += 1
        self.generate_next_obs()
        done = self.time == 4096
        info = {}
        self.last_observation = self.observation
        self.total_rewards += reward
        return self.observation, reward, done, info
For training, I used the following:
env = TestBot()
env = make_vec_env(lambda: env, n_envs=1)
model = PPO('MlpPolicy', env, verbose=0)
iters = 0
while True:
    iters += 1
    model.learn(total_timesteps=4096, reset_num_timesteps=True)
PPO gave the best result, which still wasn't great: it learned to collect positive rewards, but it took a long time and got stuck at a point far from optimal.
How can I improve learning in this scenario?
I managed to solve my problem by tuning the PPO parameters.
I had to change the following parameters:
gamma: from 0.99 to 0. It determines the importance of future rewards in the decision-making process; a value of 0 means that only immediate rewards are considered.
gae_lambda: from 0.95 to 0.65. This parameter is used in the calculation of the Generalized Advantage Estimation (GAE), a method for estimating the advantage function, i.e. how much better a given action is compared to the average action. A lower value means PPO relies less on the GAE.
clip_range: from 0.2 to a function-based schedule. It bounds how far each policy update can move from the previous policy, which in practice controls how much the behaviour keeps changing (exploring). Toward the end of training that exploration becomes irrelevant, so I made a function that keeps the clip range high in the first few iterations and drops it to 0 at the end.
I also made a small modification in the environment to penalize the missed opportunity of picking the index of a 2 more heavily, but that is only there to accelerate training.
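The answer doesn't show the modified environment, so the following is only a sketch of what such an opportunity-cost penalty could look like inside TestBot.step() (my reconstruction, not the author's exact change):
#sketch of a step() with an extra missed-opportunity penalty (reconstruction)
def step(self, action):
    reward = 0
    picked = self.last_observation[0][action] if action != 0 else None
    if action != 0:
        reward = 1 if picked == 2 else -1
    if picked != 2 and 2 in self.last_observation[0][1:]:
        #a 2 was available but was not taken: penalize the missed opportunity
        reward -= 1
    self.time += 1
    self.generate_next_obs()
    done = self.time == 4096
    info = {}
    self.last_observation = self.observation
    self.total_rewards += reward
    return self.observation, reward, done, info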
The following is my final code:
env = TestBot()
env = make_vec_env(lambda: env, n_envs=1)
iters = 0

def clip_range_schedule():
    def real_clip_range(progress):
        global iters
        cr = 0.2
        if iters > 20:
            cr = 0.0
        elif iters > 12:
            cr = 0.05
        elif iters > 6:
            cr = 0.1
        return cr
    return real_clip_range

model = PPO('MlpPolicy', env, verbose=0, gamma=0.0, gae_lambda=0.65, clip_range=clip_range_schedule())

while True:
    iters += 1
    model.learn(total_timesteps=4096, reset_num_timesteps=True)

Error trying to convert pytorch regression model to a classification model

I have a GNN that works for regression, but I have changed the task from regression to classification. I thought it would be as simple as changing the loss function and the output size, but I am getting multiple errors. This is my code:
# imports are not shown in the question; the layer names and signatures below match
# DGL's PyTorch modules, so the following imports are assumed
import math
import torch
from torch.nn import ModuleList
from torch.nn.functional import one_hot, relu
from dgl.nn.pytorch import GraphConv, GATConv, SAGEConv, TAGConv, SGConv, SumPooling

class GNN(torch.nn.Module):
    def __init__(self, gnn, n_layer, tfeature_len, dim, mlp_hidden_unit, feature_mode):
        super(GNN, self).__init__()
        self.gnn = gnn
        self.n_layer = n_layer
        self.tfeature_len = tfeature_len
        self.dim = dim
        self.gnn_layers = ModuleList([])
        if gnn in ['gcn', 'gat', 'sage', 'tag']:
            for i in range(n_layer):
                if gnn == 'gcn':
                    self.gnn_layers.append(GraphConv(in_feats=tfeature_len if i == 0 else dim,
                                                     out_feats=dim,
                                                     activation=None if i == n_layer - 1 else torch.relu))
                elif gnn == 'gat':
                    num_heads = 16  # make sure that dim is divisible by num_heads
                    self.gnn_layers.append(GATConv(in_feats=tfeature_len if i == 0 else dim,
                                                   out_feats=dim // num_heads,
                                                   activation=None if i == n_layer - 1 else torch.relu,
                                                   num_heads=num_heads))
                elif gnn == 'sage':
                    agg = 'pool'
                    self.gnn_layers.append(SAGEConv(in_feats=tfeature_len if i == 0 else dim,
                                                    out_feats=dim,
                                                    activation=None if i == n_layer - 1 else torch.relu,
                                                    aggregator_type=agg))
                elif gnn == 'tag':
                    hops = 2
                    self.gnn_layers.append(TAGConv(in_feats=tfeature_len if i == 0 else dim,
                                                   out_feats=dim,
                                                   activation=None if i == n_layer - 1 else torch.relu,
                                                   k=hops))
        elif gnn == 'sgc':
            self.gnn_layers.append(SGConv(in_feats=tfeature_len, out_feats=dim, k=n_layer))
        else:
            raise ValueError('unknown GNN model')
        self.factor = None
        self.pooling_layer = SumPooling()
        self.mlp_hidden_unit = mlp_hidden_unit
        self.feature_mode = feature_mode
        if self.feature_mode == 'concat':
            self.mlp_hidden_layer = torch.nn.Linear(2 * self.dim, self.mlp_hidden_unit)
        elif self.feature_mode == 'subtract':
            self.mlp_hidden_layer = torch.nn.Linear(self.dim, self.mlp_hidden_unit)
        else:
            raise ValueError('unknown feature mode')
        self.mlp_output_layer = torch.nn.Linear(self.mlp_hidden_unit, 2)

    def forward(self, graph1, graph2):
        graph1_embedding = self.calculate_embedding(graph1)
        graph2_embedding = self.calculate_embedding(graph2)
        if self.feature_mode == 'concat':
            hidden = relu(self.mlp_hidden_layer(torch.concat([graph1_embedding, graph2_embedding], dim=-1)))
        elif self.feature_mode == 'subtract':
            hidden = relu(self.mlp_hidden_layer(graph1_embedding - graph2_embedding))
        else:
            raise ValueError('unknown feature mode')
        output = self.mlp_output_layer(hidden)
        return output

    def calculate_embedding(self, graph):
        feature = graph.ndata['feature']
        h = one_hot(feature, num_classes=self.tfeature_len)
        h = torch.sum(h, dim=1, dtype=torch.float)
        for layer in self.gnn_layers:
            h = layer(graph, h)
            if self.gnn == 'gat':
                h = torch.reshape(h, [h.size()[0], -1])
        if self.factor is None:
            self.factor = math.sqrt(self.dim) / float(torch.mean(torch.linalg.norm(h, dim=1)))
        h *= self.factor
        graph_embedding = self.pooling_layer(graph, h)
        return graph_embedding

def train(data, model, optimizer):
    train_loader, val_loader, test_loader, tfeature_len = data
    loss_fn = torch.nn.CrossEntropyLoss()
    epoch = 23
    model = model.to(device)
    print('start training\n')
    #evaluate(model, 'train', train_loader)
    evaluate(model, 'val', val_loader)
    evaluate(model, 'test', test_loader)
    epoch_losses = []
    for i in range(epoch):
        print('epoch %d:' % i)
        epoch_loss = 0
        model.train()
        for batch_idx, (graph1, graph2, target) in enumerate(train_loader):
            pred = torch.squeeze(model(graph1, graph2))
            loss = loss_fn(pred, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
        epoch_loss /= (batch_idx + 1)  # was `iter + 1`, which references the builtin, not a batch counter
        print('Epoch {}, loss {:.4f}'.format(i, epoch_loss))
        epoch_losses.append(epoch_loss)
        #evaluate(model, 'train', train_loader)
        evaluate(model, 'val', val_loader)
        evaluate(model, 'test', test_loader)
        print()

def evaluate(model, mode, data):
    pred_list = []
    target_list = []
    model.eval()
    with torch.no_grad():
        for graph1, graph2, target in data:
            outputs = torch.softmax(model(graph1, graph2), 1)
            _, predicted = torch.max(outputs.data, 1)
            pred_list.append(predicted)
            target_list.append(target)
            #torch.sum(preds == targets).detach().cpu().numpy().
    pred_list = torch.concat(pred_list)
    target_list = torch.concat(target_list)
    #print('%s Acc: %.4f' % (mode (sklearn.metrics.accuracy_score(target_list, pred_list, normalize=False)) / len(pred_list) * 10
The accuracy calculation is commented out at the moment because it too gave me an error. My first error, however, was the following:
start training
epoch 0:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-84-9de6c6dbd2ee> in <module>
----> 1 train(data,model,optimizer)
3 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
3012 if size_average is not None or reduce is not None:
3013 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3014 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
3015
3016
RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'
My targets are as follows:
targets = training_data['Class'].tolist()
targets = torch.Tensor(targets)
targets = targets.to(device)
It is just a list of 1s and 0s.
I call my model as follows:
model = GNN('sage', 3, tfeature_len, 2048, 100, 'subtract')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
How can I fix this?
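The posted RuntimeError typically points at the target dtype: torch.nn.CrossEntropyLoss expects integer class indices (a LongTensor), while torch.Tensor(targets) creates a FloatTensor. A minimal sketch of the fix:
#CrossEntropyLoss wants integer class indices (0 or 1 here), not floats
targets = torch.tensor(training_data['Class'].tolist(), dtype=torch.long)
targets = targets.to(device)
Alternatively, the float targets could be kept and the model switched to a single output with torch.nn.BCEWithLogitsLoss; either way, the loss function and the target dtype/shape have to agree.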

In ROS turtlesim how to move turtle in sine curve

In ROS turtlesim, how can we move the turtle along a sine path? I know we need to use a proportional controller to achieve this, but I can't work out the actual method. I have attached the code I have tried so far.
Note: in the callback function I have converted the 0-to-2pi scale to the -pi-to-pi scale that is used in ROS.
#!/usr/bin/env python
import rospy
from geometry_msgs.msg import Twist
from turtlesim.msg import Pose
import math

PI = 3.1415926535897

# Initial value of theta is 0
theta = 0

# Subscriber callback function
def pose_callback(pose):
    global theta
    req = 2 * math.pi
    if pose.theta < 0:
        alpha = req - (pose.theta + (2 * math.pi))
    else:
        alpha = req - pose.theta
    alpha = 2 * math.pi - alpha
    theta = alpha

# sin_graph function
def sin_graph():
    # Starts a new node
    global theta
    rospy.init_node('sin_graph', anonymous=True)
    # Initialization of publisher
    velocity_publisher = rospy.Publisher(
        '/turtle1/cmd_vel', Twist, queue_size=10)
    # Subscribing to topic Pose
    rospy.Subscriber("/turtle1/pose", Pose, pose_callback)
    vel_msg = Twist()
    # Initializing basic data
    speed = 0.2
    radius = 1
    vel_msg.linear.x = speed
    vel_msg.linear.y = 0
    vel_msg.linear.z = 0
    vel_msg.angular.x = 0
    vel_msg.angular.y = 0
    vel_msg.angular.z = speed / radius
    # Rate at which the message is published (10 times per second)
    rate = rospy.Rate(10)
    # Keep publishing until the node is shut down
    while not rospy.is_shutdown():
        vel_msg.linear.x = speed * math.cos(theta)
        vel_msg.angular.z = math.sin(theta)
        velocity_publisher.publish(vel_msg)
        rospy.loginfo("Moving in a sine curve")
        print(theta)
        rate.sleep()
    # Forcing our robot to stop
    print("Goal Reached")
    vel_msg.linear.x = 0
    vel_msg.angular.z = 0
    velocity_publisher.publish(vel_msg)
    rospy.spin()

if __name__ == '__main__':
    try:
        # Testing our function
        sin_graph()
    except rospy.ROSInterruptException:
        pass
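Not a full answer, but one way to get an actual sine-shaped trajectory is to trace y = A*sin(k*x): keep a constant forward speed and use a proportional controller that steers the heading toward the local tangent of the curve, using pose.x and pose.theta from /turtle1/pose. The sketch below assumes this formulation; A, K_WAVE, KP and SPEED are made-up constants, not values from the question.
#sketch: follow y = A*sin(k*x) with a P-controller on the heading error
import math
import rospy
from geometry_msgs.msg import Twist
from turtlesim.msg import Pose

A, K_WAVE, KP, SPEED = 2.0, 1.0, 4.0, 1.0
pose = Pose()

def pose_callback(msg):
    global pose
    pose = msg

def follow_sine():
    rospy.init_node('sine_follower', anonymous=True)
    pub = rospy.Publisher('/turtle1/cmd_vel', Twist, queue_size=10)
    rospy.Subscriber('/turtle1/pose', Pose, pose_callback)
    rate = rospy.Rate(10)
    vel = Twist()
    while not rospy.is_shutdown():
        # desired heading is the slope of the sine curve at the current x
        theta_des = math.atan2(A * K_WAVE * math.cos(K_WAVE * pose.x), 1.0)
        # heading error wrapped to [-pi, pi]
        error = math.atan2(math.sin(theta_des - pose.theta), math.cos(theta_des - pose.theta))
        vel.linear.x = SPEED
        vel.angular.z = KP * error
        pub.publish(vel)
        rate.sleep()

if __name__ == '__main__':
    try:
        follow_sine()
    except rospy.ROSInterruptException:
        pass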

Chainer how to save and load DQN model

I'm learning the deep reinforcement learning framework Chainer.
I've followed a tutorial and ended up with the following code:
# imports implied by the snippet but not shown in the question
import copy
import time
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L

def train_dddqn(env):

    class Q_Network(chainer.Chain):
        def __init__(self, input_size, hidden_size, output_size):
            super(Q_Network, self).__init__(
                fc1=L.Linear(input_size, hidden_size),
                fc2=L.Linear(hidden_size, hidden_size),
                fc3=L.Linear(hidden_size, hidden_size // 2),
                fc4=L.Linear(hidden_size, hidden_size // 2),
                state_value=L.Linear(hidden_size // 2, 1),
                advantage_value=L.Linear(hidden_size // 2, output_size)
            )
            self.input_size = input_size
            self.hidden_size = hidden_size
            self.output_size = output_size

        def __call__(self, x):
            h = F.relu(self.fc1(x))
            h = F.relu(self.fc2(h))
            hs = F.relu(self.fc3(h))
            ha = F.relu(self.fc4(h))
            state_value = self.state_value(hs)
            advantage_value = self.advantage_value(ha)
            advantage_mean = (F.sum(advantage_value, axis=1) / float(self.output_size)).reshape(-1, 1)
            q_value = F.concat([state_value for _ in range(self.output_size)], axis=1) + (
                advantage_value - F.concat([advantage_mean for _ in range(self.output_size)], axis=1))
            return q_value

        def reset(self):
            self.cleargrads()

    Q = Q_Network(input_size=env.history_t + 1, hidden_size=100, output_size=3)
    Q_ast = copy.deepcopy(Q)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(Q)

    epoch_num = 50
    step_max = len(env.data) - 1
    memory_size = 200
    batch_size = 50
    epsilon = 1.0
    epsilon_decrease = 1e-3
    epsilon_min = 0.1
    start_reduce_epsilon = 200
    train_freq = 10
    update_q_freq = 20
    gamma = 0.97
    show_log_freq = 5

    memory = []
    total_step = 0
    total_rewards = []
    total_losses = []
    start = time.time()
    for epoch in range(epoch_num):
        pobs = env.reset()
        step = 0
        done = False
        total_reward = 0
        total_loss = 0
        while not done and step < step_max:
            # select act
            pact = np.random.randint(3)
            if np.random.rand() > epsilon:
                pact = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
                pact = np.argmax(pact.data)
            # act
            obs, reward, done = env.step(pact)
            # add memory
            memory.append((pobs, pact, reward, obs, done))
            if len(memory) > memory_size:
                memory.pop(0)
            # train or update q
            if len(memory) == memory_size:
                if total_step % train_freq == 0:
                    shuffled_memory = np.random.permutation(memory)
                    memory_idx = range(len(shuffled_memory))
                    for i in memory_idx[::batch_size]:
                        batch = np.array(shuffled_memory[i:i + batch_size])
                        b_pobs = np.array(batch[:, 0].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                        b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                        b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)
                        q = Q(b_pobs)
                        indices = np.argmax(q.data, axis=1)
                        maxqs = Q_ast(b_obs).data
                        target = copy.deepcopy(q.data)
                        for j in range(batch_size):
                            # double-DQN target update; this line appears to have been dropped
                            # from the pasted snippet and is restored here
                            target[j, b_pact[j]] = b_reward[j] + gamma * maxqs[j, indices[j]] * (not b_done[j])
                        Q.reset()
                        loss = F.mean_squared_error(q, target)
                        total_loss += loss.data
                        loss.backward()
                        optimizer.update()
                if total_step % update_q_freq == 0:
                    Q_ast = copy.deepcopy(Q)
            # epsilon
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease
            # next step
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1
        total_rewards.append(total_reward)
        total_losses.append(total_loss)
        if (epoch + 1) % show_log_freq == 0:
            log_reward = sum(total_rewards[((epoch + 1) - show_log_freq):]) / show_log_freq
            log_loss = sum(total_losses[((epoch + 1) - show_log_freq):]) / show_log_freq
            elapsed_time = time.time() - start
            print('\t'.join(map(str, [epoch + 1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
            start = time.time()
    return Q, total_losses, total_rewards

Q, total_losses, total_rewards = train_dddqn(Environment1(train))
My question is: how can I save and load this model once it has been trained? I know Keras has functions like model.save and load_model.
What is the specific code I need for this Chainer code?
You can use the serializers module to save/load a Chainer model's parameters (a Chain instance).
from chainer import serializers
Q = Q_Network(input_size=env.history_t + 1, hidden_size=100, output_size=3)
Q_ast = Q_Network(input_size=env.history_t + 1, hidden_size=100, output_size=3)
# --- train Q here... ---
# copy Q parameter into Q_ast by saving Q's parameter and load to Q_ast
serializers.save_npz('my.model', Q)
serializers.load_npz('my.model', Q_ast)
See the official documentation for details:
http://docs.chainer.org/en/stable/guides/serializers.html
Also, you may refer to ChainerRL, a Chainer library for reinforcement learning.
https://github.com/chainer/chainerrl
ChainerRL has a utility function, copy_param, to copy parameters from a source_link network to a target_link network.
https://github.com/chainer/chainerrl/blob/master/chainerrl/misc/copy_param.py#L12-L30
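For example, a short sketch based on the linked utility (double-check the import path against your chainerrl version):
from chainerrl.misc.copy_param import copy_param

# copy Q's parameters into Q_ast in place, without writing a file
copy_param(target_link=Q_ast, source_link=Q)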

keras change the parameters during training

I have a customized layer that does a simple linear transformation, like x*w+b. I want to change w and b during training; is that possible? For example, I want w1 in the first iteration and w2 in the second iteration (w1 and w2 defined by myself).
Of course, you can do it, but you need to do it in a smart way. Here is some code you can play with.
from keras import backend as K
from keras.layers import *
from keras.models import *
import numpy as np
class MyDense(Layer):
    def __init__(self, units=64, use_bias=True, **kwargs):
        super(MyDense, self).__init__(**kwargs)
        self.units = units
        self.use_bias = use_bias
        return

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.count = 0
        self.w1 = self.add_weight(shape=(input_dim, self.units), initializer='glorot_uniform', name='w1')
        self.w0 = self.add_weight(shape=(input_dim, self.units), initializer='glorot_uniform', name='w0')
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,), initializer='glorot_uniform', name='bias')
        else:
            self.bias = None
        self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True
        return

    def call(self, x):
        if self.count % 2 == 1:
            c0, c1 = 0, 1
        else:
            c0, c1 = 1, 0
        w = c0 * self.w0 + c1 * self.w1
        self.count += 1
        output = K.dot(x, w)
        if self.use_bias:
            output = K.bias_add(output, self.bias, data_format='channels_last')
        return output

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)
# define a dummy model
x = Input(shape=(128,))
y = MyDense(10)(x)
y = Dense(1, activation='sigmoid')(y)
model = Model(inputs=x, outputs=y)
print(model.summary())
# get some dummy data
a = np.random.randn(100,128)
b = (np.random.randn(100,) > 0).astype('int32')
# compile and train
model.compile('adam', 'binary_crossentropy')
model.fit( a, b )
Note: the following code is equivalent to what we did above, but it will NOT work !!!
if self.count % 2 == 1:
    w = self.w0
else:
    w = self.w1
Why? Because having a zero gradient for one of the weights (the blended version above) is NOT equivalent to having a None gradient (the branching version just shown): in the branching version the unused weight never appears in the computation graph, so it receives no gradient at all, whereas in the blended version both weights are always part of the graph, with one of them simply multiplied by zero.
