import random

import numpy as np

# Project-specific modules; the module paths below are assumptions based on
# the identifiers used in this script (MDP wraps the sailboat simulator,
# PolicyLearner is the learning agent).
from mdp import MDP
from policy_learning import PolicyLearner

TORAD = np.pi / 180  # degrees-to-radians conversion factor

history_duration = 3  # duration of the state history [s]
mdp_step = 1          # duration of one MDP state transition [s]
time_step = 0.1       # simulation time step [s] (10 Hz data acquisition)
mdp = MDP(history_duration, mdp_step, time_step)
mean = 45 * TORAD  # mean wind heading [rad]
std = 0 * TORAD    # wind heading spread [rad]
wind_samples = 10  # number of wind-heading samples per transition

WH = np.random.uniform(mean - std, mean + std, size=wind_samples)
hdg0 = 0 * np.ones(wind_samples)  # initial boat heading [rad]
mdp.initializeMDP(hdg0, WH)
hdg0_rand_vec = (-4, 0, 2, 4, 6, 8, 18, 20, 21, 22, 24)  # candidate initial headings [deg]
action_size = 2
policy_angle = 18
agent = PolicyLearner(mdp.size, action_size, policy_angle)
# agent.load("policy_learning_i18_test_long_history")  # optionally resume from saved weights
batch_size = 120
EPISODES = 500
loss_of_episode = []
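# Training loop: each episode draws a random initial heading, simulates 40
# MDP transitions under the deterministic policy, stores each transition in
# the replay memory, and fits the agent on random minibatches once enough
# samples have been collected.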
for e in range(EPISODES):
    WH = np.random.uniform(mean - std, mean + std, size=wind_samples)

    # Initialize the incidence randomly and reinitialize the memory of the flow
    hdg0_rand = random.choice(hdg0_rand_vec)
    hdg0 = hdg0_rand * TORAD * np.ones(wind_samples)
    mdp.simulator.hyst.reset()
    state = mdp.initializeMDP(hdg0, WH)

    loss_sim_list = []
    for t in range(40):  # 40 MDP transitions per episode
        WH = np.random.uniform(mean - std, mean + std, size=wind_samples)
        action = agent.actDeterministicallyUnderPolicy(state)
        next_state, reward = mdp.transition(action, WH)
        agent.remember(state, action, reward, next_state)
        state = next_state
        # Train on a random minibatch once enough transitions are stored
        if len(agent.memory) > batch_size:
            loss_sim_list.append(agent.replay(batch_size))

    # Mean training loss over the episode; the list stays empty during the
    # first episodes, before the replay memory exceeds batch_size
    if loss_sim_list:
        loss_over_simulation_time = np.mean(loss_sim_list)
    else:
        loss_over_simulation_time = float("nan")
    loss_of_episode.append(loss_over_simulation_time)

    print("Initial Heading : {}".format(hdg0_rand))
    print("episode: {}/{}, Mean Loss = {}"
          .format(e, EPISODES, loss_over_simulation_time))
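
# Optional follow-up, a minimal sketch that is not part of the original
# script: plot the per-episode mean loss and save the trained agent.
# Assumes matplotlib is installed; agent.save is assumed to exist as the
# counterpart of the agent.load call commented out above.
import matplotlib.pyplot as plt

plt.plot(range(EPISODES), loss_of_episode)
plt.xlabel("Episode")
plt.ylabel("Mean loss over episode")
plt.show()

agent.save("policy_learning_i18_test_long_history")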