Package RL
==========

Policy Learner
-----------------

.. automodule:: policyLearning
    :members:
    :undoc-members:
    :show-inheritance:

Tutorial
---------------

.. code-block:: python
    :emphasize-lines: 35,36,37,40

    history_duration = 3  # Duration of state history [s]
    mdp_step = 1          # Step between each state transition [s]
    time_step = 0.1       # Time step [s] <-> 10 Hz frequency of data acquisition
    mdp = MDP(history_duration, mdp_step, time_step)

    mean = 45 * TORAD
    std = 0 * TORAD
    wind_samples = 10
    WH = np.random.uniform(mean - std, mean + std, size=10)

    hdg0 = 0 * np.ones(10)
    mdp.initializeMDP(hdg0, WH)

    hdg0_rand_vec = (-4, 0, 2, 4, 6, 8, 18, 20, 21, 22, 24)

    action_size = 2
    policy_angle = 18
    agent = PolicyLearner(mdp.size, action_size, policy_angle)
    # agent.load("policy_learning_i18_test_long_history")

    batch_size = 120
    EPISODES = 500

    loss_of_episode = []
    for e in range(EPISODES):
        WH = np.random.uniform(mean - std, mean + std, size=10)
        hdg0_rand = random.sample(hdg0_rand_vec, 1)[0]
        hdg0 = hdg0_rand * TORAD * np.ones(10)  # Initialize the incidence randomly

        mdp.simulator.hyst.reset()  # Reinitialize the memory of the flow
        state = mdp.initializeMDP(hdg0, WH)
        loss_sim_list = []
        for time in range(40):
            WH = np.random.uniform(mean - std, mean + std, size=wind_samples)
            action = agent.actDeterministicallyUnderPolicy(state)
            next_state, reward = mdp.transition(action, WH)
            agent.remember(state, action, reward, next_state)
            state = next_state
            if len(agent.memory) > batch_size:
                loss_sim_list.append(agent.replay(batch_size))

        loss_over_simulation_time = np.sum(loss_sim_list) / len(loss_sim_list)
        loss_of_episode.append(loss_over_simulation_time)
        print("Initial Heading : {}".format(hdg0_rand))
        print("episode: {}/{}, Mean Loss = {}".format(e, EPISODES, loss_over_simulation_time))
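
The per-episode loss collected in ``loss_of_episode`` can be used to check that training
converges. The snippet below is a minimal sketch, assuming ``matplotlib`` is installed; the
commented ``agent.save`` call is an assumption that ``PolicyLearner`` exposes a ``save``
method mirroring the ``load`` method used above.

.. code-block:: python

    import matplotlib.pyplot as plt

    # Plot the mean loss per episode to monitor convergence of the policy learner
    plt.plot(loss_of_episode)
    plt.xlabel("Episode")
    plt.ylabel("Mean loss over simulation time")
    plt.title("Policy learning convergence")
    plt.show()

    # Persist the trained agent (assumes a save() counterpart to load())
    # agent.save("policy_learning_i18_test_long_history")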