Hak Cipta 2021 The TF-Agents Authors.
Lihat di TensorFlow.org | Jalankan di Google Colab | Lihat sumber di GitHub | Unduh buku catatan |
Pengantar
Pola umum dalam pembelajaran penguatan adalah menjalankan kebijakan di lingkungan untuk sejumlah langkah atau episode tertentu. Ini terjadi, misalnya, selama pengumpulan data, evaluasi, dan pembuatan video agen.
Meskipun hal ini relatif mudah ditulis dalam Python, menulis dan men-debug-nya di TensorFlow jauh lebih rumit karena melibatkan loop `tf.while`, `tf.cond`, dan `tf.control_dependencies`. Oleh karena itu, kami mengabstraksi gagasan loop eksekusi ini ke dalam sebuah kelas yang disebut `driver`, dan menyediakan implementasi yang teruji dengan baik di Python maupun TensorFlow.
Selain itu, data yang ditemui oleh driver pada setiap langkah disimpan dalam tuple bernama yang disebut Trajectory dan disiarkan ke sekumpulan pengamat seperti buffer replay dan metrik. Data ini meliputi pengamatan dari lingkungan, tindakan yang direkomendasikan oleh kebijakan, imbalan yang diperoleh, jenis langkah saat ini dan langkah berikutnya, dll.
Persiapan
Jika Anda belum menginstal tf-agents atau gym, jalankan:
pip install tf-agents
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.policies import random_py_policy
from tf_agents.policies import random_tf_policy
from tf_agents.metrics import py_metrics
from tf_agents.metrics import tf_metrics
from tf_agents.drivers import py_driver
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.trajectories import trajectory
Driver Python
Kelas `PyDriver` menerima lingkungan Python, kebijakan Python, dan daftar pengamat yang diperbarui pada setiap langkah. Metode utamanya adalah `run()`, yang menjalankan lingkungan langkah demi langkah menggunakan tindakan dari kebijakan sampai setidaknya salah satu kriteria terminasi berikut terpenuhi: jumlah langkah mencapai `max_steps` atau jumlah episode mencapai `max_episodes`.
Implementasinya kira-kira sebagai berikut:
class PyDriver(object):
    """Run a policy in an environment until a step or episode limit is hit.

    Every transition is packaged as a `trajectory.Trajectory` and handed to
    each observer in turn.
    """

    def __init__(self, env, policy, observers, max_steps=1, max_episodes=1):
        """Store the collaborators and the termination limits.

        Args:
            env: Environment object; `run` calls `env.step(action)` on it.
            policy: Policy object; `run` calls
                `policy.action(time_step, policy_state)` on it.
            observers: Callables invoked with each trajectory. A falsy value
                (e.g. `None`) means "no observers".
            max_steps: Step limit. A falsy value (`None` or 0) disables it.
            max_episodes: Episode limit. A falsy value (`None` or 0)
                disables it.
        """
        self._env = env
        self._policy = policy
        self._observers = observers or []
        # `or` maps None/0 to infinity so the matching loop condition in
        # `run` is always true. If both limits are disabled, `run` has no
        # termination condition.
        self._max_steps = max_steps or np.inf
        self._max_episodes = max_episodes or np.inf

    def run(self, time_step, policy_state=()):
        """Step the environment until one of the limits is reached.

        Args:
            time_step: Time step to start from; it must expose `step_type`
                and `observation`.
            policy_state: State passed to the first `policy.action` call.

        Returns:
            A `(time_step, policy_state)` tuple with the last time step
            returned by the environment and the last policy state. When the
            loop body never runs, the arguments are returned unchanged.
        """
        num_steps = 0
        num_episodes = 0
        while (num_steps < self._max_steps
               and num_episodes < self._max_episodes):
            # Compute an action using the policy for the given time_step.
            action_step = self._policy.action(time_step, policy_state)

            # Apply the action to the environment and get the next step.
            next_time_step = self._env.step(action_step.action)

            # Package information into a trajectory. Keyword arguments keep
            # each value bound to the intended field regardless of the
            # tuple's field order.
            traj = trajectory.Trajectory(
                step_type=time_step.step_type,
                observation=time_step.observation,
                action=action_step.action,
                policy_info=action_step.info,
                next_step_type=next_time_step.step_type,
                reward=next_time_step.reward,
                discount=next_time_step.discount)

            for observer in self._observers:
                observer(traj)

            # Update statistics to check termination. Transitions flagged
            # by `is_boundary()` are excluded from the step count.
            num_episodes += np.sum(traj.is_last())
            num_steps += np.sum(~traj.is_boundary())

            time_step = next_time_step
            policy_state = action_step.state

        return time_step, policy_state
Sekarang, mari kita lihat contoh menjalankan kebijakan acak di lingkungan CartPole, menyimpan hasilnya ke buffer replay dan menghitung beberapa metrik.
# Load the CartPole Gym environment and build a random policy that matches
# the environment's time-step and action specs.
env = suite_gym.load('CartPole-v0')
policy = random_py_policy.RandomPyPolicy(
    time_step_spec=env.time_step_spec(),
    action_spec=env.action_spec())

# A plain list stands in for a replay buffer: its bound `append` is passed
# as an observer next to the average-return metric.
replay_buffer = []
metric = py_metrics.AverageReturnMetric()
observers = [replay_buffer.append, metric]

driver = py_driver.PyDriver(
    env, policy, observers, max_steps=20, max_episodes=1)

# The Python driver is given the first time step explicitly.
initial_time_step = env.reset()
final_time_step, _ = driver.run(initial_time_step)

print('Replay Buffer:')
for traj in replay_buffer:
    print(traj)

print('Average Return: ', metric.result())
Replay Buffer: Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([-0.01483762, -0.0301547 , -0.02482025, 0.00477367], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(0, dtype=int32)}) Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([-0.01544072, 0.16531426, -0.02472478, -0.29563585], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([-0.01213443, 0.3607798 , -0.0306375 , -0.5960129 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([-0.00491884, 0.5563168 , -0.04255775, -0.8981868 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.0062075 , 0.75198895, -0.06052149, -1.2039375 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.02124728, 0.5576993 , -0.08460024, -0.9308191 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.03240127, 0.36381477, -0.10321662, -0.6658752 ], dtype=float32), 'policy_info': (), 
'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.03967756, 0.17026839, -0.11653412, -0.40739253], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.04308293, 0.36683324, -0.12468197, -0.7344236 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.0504196 , 0.17363413, -0.13937044, -0.48343614], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.05389228, -0.0192741 , -0.14903916, -0.23772195], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.0535068 , 0.17762792, -0.1537936 , -0.5734562 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.05705936, 0.37453365, -0.16526273, -0.910366 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.06455003, 0.18198717, 
-0.18347006, -0.6738478 ], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(1., dtype=float32), 'next_step_type': array(1, dtype=int32), 'observation': array([ 0.06818977, -0.01017502, -0.19694701, -0.44408032], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(0), 'discount': array(0., dtype=float32), 'next_step_type': array(2, dtype=int32), 'observation': array([ 0.06798627, -0.20204504, -0.20582862, -0.21936782], dtype=float32), 'policy_info': (), 'reward': array(1., dtype=float32), 'step_type': array(1, dtype=int32)}) Trajectory( {'action': array(1), 'discount': array(1., dtype=float32), 'next_step_type': array(0, dtype=int32), 'observation': array([ 0.06394537, -0.39372152, -0.21021597, 0.00199082], dtype=float32), 'policy_info': (), 'reward': array(0., dtype=float32), 'step_type': array(2, dtype=int32)}) Average Return: 16.0
Driver TensorFlow
Kami juga memiliki driver di TensorFlow yang secara fungsional mirip dengan driver Python, tetapi menggunakan lingkungan TF, kebijakan TF, pengamat TF, dll. Saat ini kami memiliki 2 driver TensorFlow: `DynamicStepDriver`, yang berhenti setelah sejumlah langkah lingkungan (valid) tertentu, dan `DynamicEpisodeDriver`, yang berhenti setelah sejumlah episode tertentu. Mari kita lihat contoh DynamicEpisode beraksi.
# Wrap the Python CartPole environment so it can be used with a TF policy
# and a TF driver.
env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

tf_policy = random_tf_policy.RandomTFPolicy(
    action_spec=tf_env.action_spec(),
    time_step_spec=tf_env.time_step_spec(),
)

# The two TF metrics are the driver's observers; they are also bound to
# their own names so their results can be read after the run.
observers = [tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps()]
num_episodes, env_steps = observers

driver = dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env, tf_policy, observers, num_episodes=2)

# Initial driver.run will reset the environment and initialize the policy.
final_time_step, policy_state = driver.run()

print('final_time_step', final_time_step)
for label, counter in (('Number of Steps: ', env_steps),
                       ('Number of Episodes: ', num_episodes)):
    print(label, counter.result().numpy())
final_time_step TimeStep( {'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, 'observation': <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.01182632, 0.01372784, 0.03056967, 0.04454206]], dtype=float32)>, 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>, 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>}) Number of Steps: 24 Number of Episodes: 2
# Resume from the time step and policy state returned by the previous run.
# The metrics are not reset, so the printed counts include earlier runs.
final_time_step, _ = driver.run(final_time_step, policy_state)

print('final_time_step', final_time_step)
for label, counter in (('Number of Steps: ', env_steps),
                       ('Number of Episodes: ', num_episodes)):
    print(label, counter.result().numpy())
final_time_step TimeStep( {'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>, 'observation': <tf.Tensor: shape=(1, 4), dtype=float32, numpy= array([[-0.02565088, 0.04813434, -0.04199163, 0.03810809]], dtype=float32)>, 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>, 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>}) Number of Steps: 70 Number of Episodes: 4