deep q network, loss decreasing, but performance not improving











up vote
-3
down vote

favorite












I'm quite new to deep Q networks and machine learning. I've been learning for a few months and I got stuck when trying to play with open AI gym with a deep reinforcement learning setup. Can anyone help explain whats wrong with my code and why the performance isn't increasing for the cartpole even though the loss is.



Code below



import gym
from collections import deque
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

env = gym.make("CartPole-v0")

# create new function for layers to use in the eval net and the target net
def layer(inputs, layersize, output, activation=None):
Weights = tf.Variable(tf.random_normal([layersize,output]))
bias = tf.Variable(tf.zeros([1,output])+0.1)
WX_B = tf.matmul(inputs,Weights) + bias
if activation:
output = activation(WX_B)
else:
output = WX_B
return output

'''PLACE HOLDERS'''
with tf.name_scope('inputs'):
states_PL_target = tf.placeholder(tf.float32,[None,4])
states_PL_eval = tf.placeholder(tf.float32,[None,4])
action_PL = tf.placeholder(tf.float32,[None,])
reward_pl = tf.placeholder(tf.float32,[None,1])

'''TARGET NET'''
with tf.variable_scope('target_net'):
target_l1 = layer(states_PL_target,4,32,activation=tf.nn.relu)
target_l2 = layer(target_l1,32,64,activation=tf.nn.relu)
target_l3 = layer(target_l2,64,32,activation=tf.nn.relu)
target_qvals= layer(target_l3,32,2)

with tf.variable_scope('qtarget'):
discount = tf.Variable(tf.zeros([1,2])+0.9)
gammaq = (discount * target_qvals)
target_net_output = tf.reduce_max(gammaq,axis=1) + reward_pl

####output of this should be reward + 0.9 of next q value

'''EVAL NET'''
with tf.variable_scope('eval_net'):
eval_l1 = layer(states_PL_eval,4,32,activation=tf.nn.relu)
eval_l2 = layer(eval_l1, 32, 64, activation=tf.nn.relu)
eval_l3 = layer(eval_l2,64,32,activation=tf.nn.relu)
qvals_evalnet = layer(eval_l3,32,2)

shape = tf.shape(action_PL)[0]
shape = tf.cast(shape, dtype=tf.float32)
shape = tf.range(shape, dtype=tf.float32)

action_indicies = tf.stack([shape, action_PL], axis=1)
action_indicies = tf.cast(action_indicies, dtype=tf.int32)
eval_net_output_wr_action = tf.gather_nd(params=qvals_evalnet,indices=action_indicies)

EVAL_loss = tf.reduce_mean(tf.squared_difference(target_net_output,eval_net_output_wr_action)) # tf.quared difference instead
train = tf.train.AdamOptimizer(0.001).minimize(EVAL_loss)

''' REPLACEMENT STEP'''
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

replace = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

observation = env.reset()
step = 0

prev_observation =
prev_observation2 =
runthrough =
cycle = 0
stepsss =
cycless =

batch_size = 20
memory_size = 1000
replace_step = 200
memory_storage =
memory = deque(maxlen=memory_size)
counter = 0


init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

epsilon = 0.0
runs = 100000
iterator = 0
for _ in range(runs):
# env.render()

if len(prev_observation) == 0:
action = env.action_space.sample() # your agent here (this takes random actions)
else:
if np.random.uniform() > epsilon:
action = np.random.choice([0,1])
else:
# target = sess.run(target_net_output, feed_dict={reward_pl:reward_array,states_PL_target:targetstates_array})
action = sess.run(qvals_evalnet, feed_dict={states_PL_eval:currentstates_array})
action = action[0]
action = action.argmax()
epsilon += 1/runs

observation, reward, done, info = env.step(action)
if step == 0:
prev_observation = np.array([0.,0.,0.,0.])
prev_observation2 = observation
elif step > 0:
prev_observation = prev_observation2
prev_observation2 = observation
reward = reward if not done else -1
reward_array = [reward,reward]
reward_array = np.array(reward_array)
reward_array = reward_array.reshape(2,1)
targetstates_array = np.array([observation,observation])
targetstates_array = targetstates_array.reshape(2,4)
currentstates_array = np.array([prev_observation,prev_observation])
currentstates_array = currentstates_array.reshape(2,4)



step += reward
counter += 1

'''LEARNING PART '''

memory.append(np.array([prev_observation,reward,action,observation]))
if len(memory) == memory_size:
memory_storage = np.array(memory)

if counter % replace_step == 0:
sess.run(replace)

if counter >= memory_size:
sample_index = np.random.choice(memory_size,size=batch_size)


if counter >= memory_size:
sample_train = memory_storage[sample_index]
target_states = [obs[3] for obs in sample_train]
eval_states = [obs[0] for obs in sample_train]
actionstaken = [obs[2] for obs in sample_train]
rewardgotten = [obs[1] for obs in sample_train]
target_states = np.array(target_states).astype(np.float32)
eval_states = np.array(eval_states).astype(np.float32)
actionstaken = np.array(actionstaken).astype(np.float32)
rewardgotten = np.array(rewardgotten).astype(np.float32)
rewardgotten = rewardgotten.reshape(-1, 1)


sess.run(train, feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})
loss = sess.run(EVAL_loss,feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})

if done:
env.reset()

if counter >= memory_size:
stepsss.append(step)
cycless.append(cycle)
step = 0
cycle += 1
# print(cycle)
if counter >= memory_size:
# pass
print(loss)


print(stepsss)
plt.plot(cycless,stepsss)
plt.show()









share|improve this question




























    up vote
    -3
    down vote

    favorite












    I'm quite new to deep Q networks and machine learning. I've been learning for a few months and I got stuck when trying to play with open AI gym with a deep reinforcement learning setup. Can anyone help explain whats wrong with my code and why the performance isn't increasing for the cartpole even though the loss is.



    Code below



    import gym
    from collections import deque
    import tensorflow as tf
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make("CartPole-v0")

    # create new function for layers to use in the eval net and the target net
    def layer(inputs, layersize, output, activation=None):
    Weights = tf.Variable(tf.random_normal([layersize,output]))
    bias = tf.Variable(tf.zeros([1,output])+0.1)
    WX_B = tf.matmul(inputs,Weights) + bias
    if activation:
    output = activation(WX_B)
    else:
    output = WX_B
    return output

    '''PLACE HOLDERS'''
    with tf.name_scope('inputs'):
    states_PL_target = tf.placeholder(tf.float32,[None,4])
    states_PL_eval = tf.placeholder(tf.float32,[None,4])
    action_PL = tf.placeholder(tf.float32,[None,])
    reward_pl = tf.placeholder(tf.float32,[None,1])

    '''TARGET NET'''
    with tf.variable_scope('target_net'):
    target_l1 = layer(states_PL_target,4,32,activation=tf.nn.relu)
    target_l2 = layer(target_l1,32,64,activation=tf.nn.relu)
    target_l3 = layer(target_l2,64,32,activation=tf.nn.relu)
    target_qvals= layer(target_l3,32,2)

    with tf.variable_scope('qtarget'):
    discount = tf.Variable(tf.zeros([1,2])+0.9)
    gammaq = (discount * target_qvals)
    target_net_output = tf.reduce_max(gammaq,axis=1) + reward_pl

    ####output of this should be reward + 0.9 of next q value

    '''EVAL NET'''
    with tf.variable_scope('eval_net'):
    eval_l1 = layer(states_PL_eval,4,32,activation=tf.nn.relu)
    eval_l2 = layer(eval_l1, 32, 64, activation=tf.nn.relu)
    eval_l3 = layer(eval_l2,64,32,activation=tf.nn.relu)
    qvals_evalnet = layer(eval_l3,32,2)

    shape = tf.shape(action_PL)[0]
    shape = tf.cast(shape, dtype=tf.float32)
    shape = tf.range(shape, dtype=tf.float32)

    action_indicies = tf.stack([shape, action_PL], axis=1)
    action_indicies = tf.cast(action_indicies, dtype=tf.int32)
    eval_net_output_wr_action = tf.gather_nd(params=qvals_evalnet,indices=action_indicies)

    EVAL_loss = tf.reduce_mean(tf.squared_difference(target_net_output,eval_net_output_wr_action)) # tf.quared difference instead
    train = tf.train.AdamOptimizer(0.001).minimize(EVAL_loss)

    ''' REPLACEMENT STEP'''
    t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
    e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

    replace = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

    observation = env.reset()
    step = 0

    prev_observation =
    prev_observation2 =
    runthrough =
    cycle = 0
    stepsss =
    cycless =

    batch_size = 20
    memory_size = 1000
    replace_step = 200
    memory_storage =
    memory = deque(maxlen=memory_size)
    counter = 0


    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    epsilon = 0.0
    runs = 100000
    iterator = 0
    for _ in range(runs):
    # env.render()

    if len(prev_observation) == 0:
    action = env.action_space.sample() # your agent here (this takes random actions)
    else:
    if np.random.uniform() > epsilon:
    action = np.random.choice([0,1])
    else:
    # target = sess.run(target_net_output, feed_dict={reward_pl:reward_array,states_PL_target:targetstates_array})
    action = sess.run(qvals_evalnet, feed_dict={states_PL_eval:currentstates_array})
    action = action[0]
    action = action.argmax()
    epsilon += 1/runs

    observation, reward, done, info = env.step(action)
    if step == 0:
    prev_observation = np.array([0.,0.,0.,0.])
    prev_observation2 = observation
    elif step > 0:
    prev_observation = prev_observation2
    prev_observation2 = observation
    reward = reward if not done else -1
    reward_array = [reward,reward]
    reward_array = np.array(reward_array)
    reward_array = reward_array.reshape(2,1)
    targetstates_array = np.array([observation,observation])
    targetstates_array = targetstates_array.reshape(2,4)
    currentstates_array = np.array([prev_observation,prev_observation])
    currentstates_array = currentstates_array.reshape(2,4)



    step += reward
    counter += 1

    '''LEARNING PART '''

    memory.append(np.array([prev_observation,reward,action,observation]))
    if len(memory) == memory_size:
    memory_storage = np.array(memory)

    if counter % replace_step == 0:
    sess.run(replace)

    if counter >= memory_size:
    sample_index = np.random.choice(memory_size,size=batch_size)


    if counter >= memory_size:
    sample_train = memory_storage[sample_index]
    target_states = [obs[3] for obs in sample_train]
    eval_states = [obs[0] for obs in sample_train]
    actionstaken = [obs[2] for obs in sample_train]
    rewardgotten = [obs[1] for obs in sample_train]
    target_states = np.array(target_states).astype(np.float32)
    eval_states = np.array(eval_states).astype(np.float32)
    actionstaken = np.array(actionstaken).astype(np.float32)
    rewardgotten = np.array(rewardgotten).astype(np.float32)
    rewardgotten = rewardgotten.reshape(-1, 1)


    sess.run(train, feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})
    loss = sess.run(EVAL_loss,feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})

    if done:
    env.reset()

    if counter >= memory_size:
    stepsss.append(step)
    cycless.append(cycle)
    step = 0
    cycle += 1
    # print(cycle)
    if counter >= memory_size:
    # pass
    print(loss)


    print(stepsss)
    plt.plot(cycless,stepsss)
    plt.show()









    share|improve this question


























      up vote
      -3
      down vote

      favorite









      up vote
      -3
      down vote

      favorite











      I'm quite new to deep Q networks and machine learning. I've been learning for a few months and I got stuck when trying to play with open AI gym with a deep reinforcement learning setup. Can anyone help explain whats wrong with my code and why the performance isn't increasing for the cartpole even though the loss is.



      Code below



      import gym
      from collections import deque
      import tensorflow as tf
      import numpy as np
      import matplotlib.pyplot as plt

      env = gym.make("CartPole-v0")

      # create new function for layers to use in the eval net and the target net
      def layer(inputs, layersize, output, activation=None):
      Weights = tf.Variable(tf.random_normal([layersize,output]))
      bias = tf.Variable(tf.zeros([1,output])+0.1)
      WX_B = tf.matmul(inputs,Weights) + bias
      if activation:
      output = activation(WX_B)
      else:
      output = WX_B
      return output

      '''PLACE HOLDERS'''
      with tf.name_scope('inputs'):
      states_PL_target = tf.placeholder(tf.float32,[None,4])
      states_PL_eval = tf.placeholder(tf.float32,[None,4])
      action_PL = tf.placeholder(tf.float32,[None,])
      reward_pl = tf.placeholder(tf.float32,[None,1])

      '''TARGET NET'''
      with tf.variable_scope('target_net'):
      target_l1 = layer(states_PL_target,4,32,activation=tf.nn.relu)
      target_l2 = layer(target_l1,32,64,activation=tf.nn.relu)
      target_l3 = layer(target_l2,64,32,activation=tf.nn.relu)
      target_qvals= layer(target_l3,32,2)

      with tf.variable_scope('qtarget'):
      discount = tf.Variable(tf.zeros([1,2])+0.9)
      gammaq = (discount * target_qvals)
      target_net_output = tf.reduce_max(gammaq,axis=1) + reward_pl

      ####output of this should be reward + 0.9 of next q value

      '''EVAL NET'''
      with tf.variable_scope('eval_net'):
      eval_l1 = layer(states_PL_eval,4,32,activation=tf.nn.relu)
      eval_l2 = layer(eval_l1, 32, 64, activation=tf.nn.relu)
      eval_l3 = layer(eval_l2,64,32,activation=tf.nn.relu)
      qvals_evalnet = layer(eval_l3,32,2)

      shape = tf.shape(action_PL)[0]
      shape = tf.cast(shape, dtype=tf.float32)
      shape = tf.range(shape, dtype=tf.float32)

      action_indicies = tf.stack([shape, action_PL], axis=1)
      action_indicies = tf.cast(action_indicies, dtype=tf.int32)
      eval_net_output_wr_action = tf.gather_nd(params=qvals_evalnet,indices=action_indicies)

      EVAL_loss = tf.reduce_mean(tf.squared_difference(target_net_output,eval_net_output_wr_action)) # tf.quared difference instead
      train = tf.train.AdamOptimizer(0.001).minimize(EVAL_loss)

      ''' REPLACEMENT STEP'''
      t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
      e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

      replace = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

      observation = env.reset()
      step = 0

      prev_observation =
      prev_observation2 =
      runthrough =
      cycle = 0
      stepsss =
      cycless =

      batch_size = 20
      memory_size = 1000
      replace_step = 200
      memory_storage =
      memory = deque(maxlen=memory_size)
      counter = 0


      init = tf.global_variables_initializer()
      sess = tf.Session()
      sess.run(init)

      epsilon = 0.0
      runs = 100000
      iterator = 0
      for _ in range(runs):
      # env.render()

      if len(prev_observation) == 0:
      action = env.action_space.sample() # your agent here (this takes random actions)
      else:
      if np.random.uniform() > epsilon:
      action = np.random.choice([0,1])
      else:
      # target = sess.run(target_net_output, feed_dict={reward_pl:reward_array,states_PL_target:targetstates_array})
      action = sess.run(qvals_evalnet, feed_dict={states_PL_eval:currentstates_array})
      action = action[0]
      action = action.argmax()
      epsilon += 1/runs

      observation, reward, done, info = env.step(action)
      if step == 0:
      prev_observation = np.array([0.,0.,0.,0.])
      prev_observation2 = observation
      elif step > 0:
      prev_observation = prev_observation2
      prev_observation2 = observation
      reward = reward if not done else -1
      reward_array = [reward,reward]
      reward_array = np.array(reward_array)
      reward_array = reward_array.reshape(2,1)
      targetstates_array = np.array([observation,observation])
      targetstates_array = targetstates_array.reshape(2,4)
      currentstates_array = np.array([prev_observation,prev_observation])
      currentstates_array = currentstates_array.reshape(2,4)



      step += reward
      counter += 1

      '''LEARNING PART '''

      memory.append(np.array([prev_observation,reward,action,observation]))
      if len(memory) == memory_size:
      memory_storage = np.array(memory)

      if counter % replace_step == 0:
      sess.run(replace)

      if counter >= memory_size:
      sample_index = np.random.choice(memory_size,size=batch_size)


      if counter >= memory_size:
      sample_train = memory_storage[sample_index]
      target_states = [obs[3] for obs in sample_train]
      eval_states = [obs[0] for obs in sample_train]
      actionstaken = [obs[2] for obs in sample_train]
      rewardgotten = [obs[1] for obs in sample_train]
      target_states = np.array(target_states).astype(np.float32)
      eval_states = np.array(eval_states).astype(np.float32)
      actionstaken = np.array(actionstaken).astype(np.float32)
      rewardgotten = np.array(rewardgotten).astype(np.float32)
      rewardgotten = rewardgotten.reshape(-1, 1)


      sess.run(train, feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})
      loss = sess.run(EVAL_loss,feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})

      if done:
      env.reset()

      if counter >= memory_size:
      stepsss.append(step)
      cycless.append(cycle)
      step = 0
      cycle += 1
      # print(cycle)
      if counter >= memory_size:
      # pass
      print(loss)


      print(stepsss)
      plt.plot(cycless,stepsss)
      plt.show()









      share|improve this question















      I'm quite new to deep Q networks and machine learning. I've been learning for a few months and I got stuck when trying to play with open AI gym with a deep reinforcement learning setup. Can anyone help explain whats wrong with my code and why the performance isn't increasing for the cartpole even though the loss is.



      Code below



      import gym
      from collections import deque
      import tensorflow as tf
      import numpy as np
      import matplotlib.pyplot as plt

      env = gym.make("CartPole-v0")

      # create new function for layers to use in the eval net and the target net
      def layer(inputs, layersize, output, activation=None):
      Weights = tf.Variable(tf.random_normal([layersize,output]))
      bias = tf.Variable(tf.zeros([1,output])+0.1)
      WX_B = tf.matmul(inputs,Weights) + bias
      if activation:
      output = activation(WX_B)
      else:
      output = WX_B
      return output

      '''PLACE HOLDERS'''
      with tf.name_scope('inputs'):
      states_PL_target = tf.placeholder(tf.float32,[None,4])
      states_PL_eval = tf.placeholder(tf.float32,[None,4])
      action_PL = tf.placeholder(tf.float32,[None,])
      reward_pl = tf.placeholder(tf.float32,[None,1])

      '''TARGET NET'''
      with tf.variable_scope('target_net'):
      target_l1 = layer(states_PL_target,4,32,activation=tf.nn.relu)
      target_l2 = layer(target_l1,32,64,activation=tf.nn.relu)
      target_l3 = layer(target_l2,64,32,activation=tf.nn.relu)
      target_qvals= layer(target_l3,32,2)

      with tf.variable_scope('qtarget'):
      discount = tf.Variable(tf.zeros([1,2])+0.9)
      gammaq = (discount * target_qvals)
      target_net_output = tf.reduce_max(gammaq,axis=1) + reward_pl

      ####output of this should be reward + 0.9 of next q value

      '''EVAL NET'''
      with tf.variable_scope('eval_net'):
      eval_l1 = layer(states_PL_eval,4,32,activation=tf.nn.relu)
      eval_l2 = layer(eval_l1, 32, 64, activation=tf.nn.relu)
      eval_l3 = layer(eval_l2,64,32,activation=tf.nn.relu)
      qvals_evalnet = layer(eval_l3,32,2)

      shape = tf.shape(action_PL)[0]
      shape = tf.cast(shape, dtype=tf.float32)
      shape = tf.range(shape, dtype=tf.float32)

      action_indicies = tf.stack([shape, action_PL], axis=1)
      action_indicies = tf.cast(action_indicies, dtype=tf.int32)
      eval_net_output_wr_action = tf.gather_nd(params=qvals_evalnet,indices=action_indicies)

      EVAL_loss = tf.reduce_mean(tf.squared_difference(target_net_output,eval_net_output_wr_action)) # tf.quared difference instead
      train = tf.train.AdamOptimizer(0.001).minimize(EVAL_loss)

      ''' REPLACEMENT STEP'''
      t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
      e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

      replace = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

      observation = env.reset()
      step = 0

      prev_observation =
      prev_observation2 =
      runthrough =
      cycle = 0
      stepsss =
      cycless =

      batch_size = 20
      memory_size = 1000
      replace_step = 200
      memory_storage =
      memory = deque(maxlen=memory_size)
      counter = 0


      init = tf.global_variables_initializer()
      sess = tf.Session()
      sess.run(init)

      epsilon = 0.0
      runs = 100000
      iterator = 0
      for _ in range(runs):
      # env.render()

      if len(prev_observation) == 0:
      action = env.action_space.sample() # your agent here (this takes random actions)
      else:
      if np.random.uniform() > epsilon:
      action = np.random.choice([0,1])
      else:
      # target = sess.run(target_net_output, feed_dict={reward_pl:reward_array,states_PL_target:targetstates_array})
      action = sess.run(qvals_evalnet, feed_dict={states_PL_eval:currentstates_array})
      action = action[0]
      action = action.argmax()
      epsilon += 1/runs

      observation, reward, done, info = env.step(action)
      if step == 0:
      prev_observation = np.array([0.,0.,0.,0.])
      prev_observation2 = observation
      elif step > 0:
      prev_observation = prev_observation2
      prev_observation2 = observation
      reward = reward if not done else -1
      reward_array = [reward,reward]
      reward_array = np.array(reward_array)
      reward_array = reward_array.reshape(2,1)
      targetstates_array = np.array([observation,observation])
      targetstates_array = targetstates_array.reshape(2,4)
      currentstates_array = np.array([prev_observation,prev_observation])
      currentstates_array = currentstates_array.reshape(2,4)



      step += reward
      counter += 1

      '''LEARNING PART '''

      memory.append(np.array([prev_observation,reward,action,observation]))
      if len(memory) == memory_size:
      memory_storage = np.array(memory)

      if counter % replace_step == 0:
      sess.run(replace)

      if counter >= memory_size:
      sample_index = np.random.choice(memory_size,size=batch_size)


      if counter >= memory_size:
      sample_train = memory_storage[sample_index]
      target_states = [obs[3] for obs in sample_train]
      eval_states = [obs[0] for obs in sample_train]
      actionstaken = [obs[2] for obs in sample_train]
      rewardgotten = [obs[1] for obs in sample_train]
      target_states = np.array(target_states).astype(np.float32)
      eval_states = np.array(eval_states).astype(np.float32)
      actionstaken = np.array(actionstaken).astype(np.float32)
      rewardgotten = np.array(rewardgotten).astype(np.float32)
      rewardgotten = rewardgotten.reshape(-1, 1)


      sess.run(train, feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})
      loss = sess.run(EVAL_loss,feed_dict={states_PL_target:target_states,states_PL_eval:eval_states,action_PL:actionstaken,reward_pl:rewardgotten})

      if done:
      env.reset()

      if counter >= memory_size:
      stepsss.append(step)
      cycless.append(cycle)
      step = 0
      cycle += 1
      # print(cycle)
      if counter >= memory_size:
      # pass
      print(loss)


      print(stepsss)
      plt.plot(cycless,stepsss)
      plt.show()






      python neural-network deep-learning reinforcement-learning






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited Nov 22 at 16:43









      Skynet

      4,27452640




      4,27452640










      asked Nov 22 at 13:43









      Paulo Abalos

      1




      1





























          active

          oldest

          votes











          Your Answer






          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "1"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: true,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: 10,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53432336%2fdeep-q-network-loss-decreasing-but-performance-not-improving%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown






























          active

          oldest

          votes













          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes
















          draft saved

          draft discarded




















































          Thanks for contributing an answer to Stack Overflow!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.





          Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


          Please pay close attention to the following guidance:


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53432336%2fdeep-q-network-loss-decreasing-but-performance-not-improving%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          A CLEAN and SIMPLE way to add appendices to Table of Contents and bookmarks

          Calculate evaluation metrics using cross_val_predict sklearn

          Insert data from modal to MySQL (multiple modal on website)