Package RL

Policy Learner

Tutorial

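This tutorial trains a PolicyLearner agent on the simulator MDP: it builds the MDP, draws a random initial heading for each of the 500 training episodes, runs 40 transitions per episode while storing the experiences, and performs experience-replay updates, printing the mean training loss of every episode.
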
 import random

 import numpy as np

 # MDP, PolicyLearner and the degree-to-radian constant TORAD are provided by
 # this package; adjust the import paths to match your installation.

 history_duration = 3  # Duration of the state history [s]
 mdp_step = 1  # Step between two state transitions [s]
 time_step = 0.1  # Time step [s] <-> 10 Hz data acquisition frequency
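 # The state therefore spans history_duration / time_step = 30 simulation steps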
 mdp = MDP(history_duration, mdp_step, time_step)

 mean = 45 * TORAD  # Mean wind heading [rad]
 std = 0 * TORAD  # Wind heading spread [rad]; zero gives a constant wind
 wind_samples = 10  # Number of wind samples drawn per MDP transition
 WH = np.random.uniform(mean - std, mean + std, size=wind_samples)

 hdg0 = 0 * np.ones(10)  # Initial heading [rad]
 mdp.initializeMDP(hdg0, WH)

 # Candidate initial headings [deg]; one is drawn at random for each episode
 hdg0_rand_vec = (-4, 0, 2, 4, 6, 8, 18, 20, 21, 22, 24)

 action_size = 2  # Two discrete actions
 policy_angle = 18  # Angle [deg] parameterising the policy to learn
 agent = PolicyLearner(mdp.size, action_size, policy_angle)
 # agent.load("policy_learning_i18_test_long_history")  # optionally resume from saved weights
 batch_size = 120  # Size of the experience-replay batches

 EPISODES = 500  # Number of training episodes

 loss_of_episode = []
 for e in range(EPISODES):
     WH = np.random.uniform(mean - std, mean + std, size=10)
     hdg0_rand = random.sample(hdg0_rand_vec, 1)[0]
     hdg0 = hdg0_rand * TORAD * np.ones(10)
     # Initialize the incidence randomly
     mdp.simulator.hyst.reset()  # Reinitialize the memory of the flow
     state = mdp.initializeMDP(hdg0, WH)
     loss_sim_list = []
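     # Interaction loop: act under the fixed policy, apply the MDP transition,
     # store the experience, then train on replayed batches once enough
     # transitions have been collected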
     for time in range(40):  # 40 transitions per episode
         WH = np.random.uniform(mean - std, mean + std, size=wind_samples)
         action = agent.actDeterministicallyUnderPolicy(state)
         next_state, reward = mdp.transition(action, WH)
         agent.remember(state, action, reward, next_state)
         state = next_state
         if len(agent.memory) > batch_size:
             # Replay a batch of stored transitions and record the training loss
             loss_sim_list.append(agent.replay(batch_size))
     loss_over_simulation_time = np.mean(loss_sim_list)  # Mean loss over the episode
     loss_of_episode.append(loss_over_simulation_time)
     print("Initial Heading : {}".format(hdg0_rand))
     print("episode: {}/{}, Mean Loss = {}"
           .format(e, EPISODES, loss_over_simulation_time))
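
Once training has finished, the per-episode losses collected in loss_of_episode can be inspected and the trained weights stored. The snippet below is a minimal sketch: it assumes matplotlib is installed, and it assumes the agent exposes a save() method mirroring the load() call shown (commented out) above; adapt the names if your version of the package differs.

 import matplotlib.pyplot as plt

 # Plot the mean loss of each training episode
 plt.plot(loss_of_episode)
 plt.xlabel("Episode")
 plt.ylabel("Mean loss")
 plt.show()

 # Persist the learned policy for later reuse (assumed counterpart of agent.load)
 agent.save("policy_learning_i18_test_long_history")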