{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ef6d5298",
   "metadata": {},
   "source": [
    "# Reward, Cost, Termination, and Step Information\n",
    "\n",
    "[![Click and Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/metadriverse/metadrive/blob/main/documentation/source/reward_cost_done.ipynb)\n",
    "\n",
    "\n",
    "\n",
    "Following the standard OpenAI Gym API, after each step of the environment `env.step(...)`, the environment will return a tuple containing five items: `(obs, reward, terminated, truncated, info)`. In this page, we discuss the design of reward function `reward`, cost function `info[\"cost\"]`, termination criterion `terminated` in various settings, truncation information `truncated`, and the details of step information `info`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed8d0bad",
   "metadata": {},
   "source": [
    "## Reward Function\n",
    "\n",
    "For all environments, reward functions consist of generally a dense driving reward and a sparse terminal reward. The dense reward is the longitudinal movement along the reference line or lane toward destination. When the episode is terminated due to, i.e. arriving the destination or driving out of the road, a sparse reward will be added to the dense reward. In practice, the concrete implementations of reward function are slightly different across all environments. \n",
    "\n",
    "### MetaDriveEnv\n",
    "The reward functions for the `MetaDriveEnv` and derived environments like `Multi-agent Environments` and `SafeMetaDriveEnv` are similar as they all using `PGMap`. The implementation is as follows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fb7b7072",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[34mdef\u001b[39;49;00m \u001b[32mreward_function\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, vehicle_id: \u001b[36mstr\u001b[39;49;00m):\n",
      "    \u001b[33m\"\"\"\u001b[39;49;00m\n",
      "\u001b[33m    Override this func to get a new reward function\u001b[39;49;00m\n",
      "\u001b[33m    :param vehicle_id: id of BaseVehicle\u001b[39;49;00m\n",
      "\u001b[33m    :return: reward\u001b[39;49;00m\n",
      "\u001b[33m    \"\"\"\u001b[39;49;00m\n",
      "    vehicle = \u001b[36mself\u001b[39;49;00m.vehicles[vehicle_id]\n",
      "    step_info = \u001b[36mdict\u001b[39;49;00m()\n",
      "\n",
      "    \u001b[37m# Reward for moving forward in current lane\u001b[39;49;00m\n",
      "    \u001b[34mif\u001b[39;49;00m vehicle.lane \u001b[35min\u001b[39;49;00m vehicle.navigation.current_ref_lanes:\n",
      "        current_lane = vehicle.lane\n",
      "        positive_road = \u001b[34m1\u001b[39;49;00m\n",
      "    \u001b[34melse\u001b[39;49;00m:\n",
      "        current_lane = vehicle.navigation.current_ref_lanes[\u001b[34m0\u001b[39;49;00m]\n",
      "        current_road = vehicle.navigation.current_road\n",
      "        positive_road = \u001b[34m1\u001b[39;49;00m \u001b[34mif\u001b[39;49;00m \u001b[35mnot\u001b[39;49;00m current_road.is_negative_road() \u001b[34melse\u001b[39;49;00m -\u001b[34m1\u001b[39;49;00m\n",
      "    long_last, _ = current_lane.local_coordinates(vehicle.last_position)\n",
      "    long_now, lateral_now = current_lane.local_coordinates(vehicle.position)\n",
      "\n",
      "    \u001b[37m# reward for lane keeping, without it vehicle can learn to overtake but fail to keep in lane\u001b[39;49;00m\n",
      "    \u001b[34mif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33muse_lateral_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]:\n",
      "        lateral_factor = clip(\u001b[34m1\u001b[39;49;00m - \u001b[34m2\u001b[39;49;00m * \u001b[36mabs\u001b[39;49;00m(lateral_now) / vehicle.navigation.get_current_lane_width(), \u001b[34m0.0\u001b[39;49;00m, \u001b[34m1.0\u001b[39;49;00m)\n",
      "    \u001b[34melse\u001b[39;49;00m:\n",
      "        lateral_factor = \u001b[34m1.0\u001b[39;49;00m\n",
      "\n",
      "    reward = \u001b[34m0.0\u001b[39;49;00m\n",
      "    reward += \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mdriving_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] * (long_now - long_last) * lateral_factor * positive_road\n",
      "    reward += \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mspeed_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] * (vehicle.speed_km_h / vehicle.max_speed_km_h) * positive_road\n",
      "\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mstep_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = reward\n",
      "\n",
      "    \u001b[34mif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m._is_arrive_destination(vehicle):\n",
      "        reward = +\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33msuccess_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34melif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m._is_out_of_road(vehicle):\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mout_of_road_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34melif\u001b[39;49;00m vehicle.crash_vehicle:\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_vehicle_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34melif\u001b[39;49;00m vehicle.crash_object:\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_object_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34mreturn\u001b[39;49;00m reward, step_info\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from metadrive.envs.metadrive_env import MetaDriveEnv\n",
    "from metadrive.envs.scenario_env import ScenarioEnv\n",
    "from metadrive.utils.doc_utils import print_source\n",
    "print_source(MetaDriveEnv.reward_function)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc81c42e",
   "metadata": {},
   "source": [
    "This reward function is composed of three parts as follows:\n",
    "\n",
    "$R = c_{1} R_{driving} + c_{2} R_{speed} + R_{termination}$\n",
    "\n",
    "- The **driving reward**  $R_{driving} = d_t - d_{t-1}$, wherein the $d_t$ and $d_{t-1}$ denote the longitudinal coordinates of the target vehicle on the current reference lane of two consecutive time steps, providing dense reward to encourage agent to move toward the destination.\n",
    "- The **speed reward** $R_{speed} = v_t/v_{max}$ incentives agent to drive fast. $v_{t}$ and $v_{max}$ denote the current velocity and the maximum velocity (80 km/h), respectively.\n",
    "- The **termination reward** $R_{termination}$ contains a set of sparse rewards. At the end of episode, other dense rewards will be disabled and only one sparse reward will be given to the agent at the end of the episode according to its termination state. We implement the `success_reward`, `out_of_road_penalty`, `crash_vehicle_penalty` and `crash_object_penalty` currently. The penalty will be given as negative reward.\n",
    "\n",
    "We also provide a config call `use_lateral_reward`, which is a multiplier in range [0, 1] indicating whether the ego vehicle is far from the center of current lane. The multiplier will apply to the driving reward.\n",
    "\n",
    "We summarize the default reward config here:\n",
    "\n",
    "- `success_reward = 10.0`: one of termination reward.\n",
    "- `out_of_road_penalty = 5.0`: will use -5.0 as the termination reward.\n",
    "- `crash_vehicle_penalty = 5.0`: will use -5.0 as the termination reward.\n",
    "- `crash_object_penalty = 5.0`: will use -5.0 as the termination reward.\n",
    "- `driving_reward = 1.0`: the $c_{1}$ in reward function.\n",
    "- `speed_reward = 0.1`: the $c_{2}$ in reward function.\n",
    "- `use_lateral_reward = False`: disable weighting the driving reward according to centering in the lane.\n",
    "\n",
    "### ScenarioEnv\n",
    "The reward function for `ScenarioEnv` is similar to the one of `MetaDriveEnv`, while the calculation of driving reward is slightly different and there are more items. The concrete implementation is:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5056ea42",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[34mdef\u001b[39;49;00m \u001b[32mreward_function\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, vehicle_id: \u001b[36mstr\u001b[39;49;00m):\n",
      "    \u001b[33m\"\"\"\u001b[39;49;00m\n",
      "\u001b[33m    Override this func to get a new reward function\u001b[39;49;00m\n",
      "\u001b[33m    :param vehicle_id: id of BaseVehicle\u001b[39;49;00m\n",
      "\u001b[33m    :return: reward\u001b[39;49;00m\n",
      "\u001b[33m    \"\"\"\u001b[39;49;00m\n",
      "    vehicle = \u001b[36mself\u001b[39;49;00m.vehicles[vehicle_id]\n",
      "    step_info = \u001b[36mdict\u001b[39;49;00m()\n",
      "\n",
      "    \u001b[37m# Reward for moving forward in current lane\u001b[39;49;00m\n",
      "    current_lane = vehicle.lane\n",
      "    long_last = vehicle.navigation.last_longitude\n",
      "    long_now = vehicle.navigation.current_longitude\n",
      "    lateral_now = vehicle.navigation.current_lateral\n",
      "\n",
      "    \u001b[37m# dense driving reward\u001b[39;49;00m\n",
      "    reward = \u001b[34m0\u001b[39;49;00m\n",
      "    reward += \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mdriving_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] * (long_now - long_last)\n",
      "\n",
      "    \u001b[37m# reward for lane keeping, without it vehicle can learn to overtake but fail to keep in lane\u001b[39;49;00m\n",
      "    lateral_factor = \u001b[36mabs\u001b[39;49;00m(lateral_now) / \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mmax_lateral_dist\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    lateral_penalty = -lateral_factor * \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mlateral_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    reward += lateral_penalty\n",
      "\n",
      "    \u001b[37m# heading diff\u001b[39;49;00m\n",
      "    ref_line_heading = vehicle.navigation.current_heading_theta_at_long\n",
      "    heading_diff = wrap_to_pi(\u001b[36mabs\u001b[39;49;00m(vehicle.heading_theta - ref_line_heading)) / np.pi\n",
      "    heading_penalty = -heading_diff * \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mheading_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    reward += heading_penalty\n",
      "\n",
      "    \u001b[37m# steering_range\u001b[39;49;00m\n",
      "    steering = \u001b[36mabs\u001b[39;49;00m(vehicle.current_action[\u001b[34m0\u001b[39;49;00m])\n",
      "    allowed_steering = (\u001b[34m1\u001b[39;49;00m / \u001b[36mmax\u001b[39;49;00m(vehicle.speed, \u001b[34m1e-2\u001b[39;49;00m))\n",
      "    overflowed_steering = \u001b[36mmin\u001b[39;49;00m((allowed_steering - steering), \u001b[34m0\u001b[39;49;00m)\n",
      "    steering_range_penalty = overflowed_steering * \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33msteering_range_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    reward += steering_range_penalty\n",
      "\n",
      "    \u001b[34mif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mno_negative_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]:\n",
      "        reward = \u001b[36mmax\u001b[39;49;00m(reward, \u001b[34m0\u001b[39;49;00m)\n",
      "\n",
      "    \u001b[37m# crash penalty\u001b[39;49;00m\n",
      "    \u001b[34mif\u001b[39;49;00m vehicle.crash_vehicle:\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_vehicle_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34mif\u001b[39;49;00m vehicle.crash_object:\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_object_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34mif\u001b[39;49;00m vehicle.crash_human:\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_human_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[37m# lane line penalty\u001b[39;49;00m\n",
      "    \u001b[34mif\u001b[39;49;00m vehicle.on_yellow_continuous_line \u001b[35mor\u001b[39;49;00m vehicle.crash_sidewalk \u001b[35mor\u001b[39;49;00m vehicle.on_white_continuous_line:\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mon_lane_line_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mstep_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = reward\n",
      "\n",
      "    \u001b[37m# termination reward\u001b[39;49;00m\n",
      "    \u001b[34mif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m._is_arrive_destination(vehicle):\n",
      "        reward = \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33msuccess_reward\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34melif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m._is_out_of_road(vehicle):\n",
      "        reward = -\u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mout_of_road_penalty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "\n",
      "    \u001b[37m# TODO LQY: all a callback to process these keys\u001b[39;49;00m\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mtrack_length\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = vehicle.navigation.reference_trajectory.length\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcarsize\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = [vehicle.WIDTH, vehicle.LENGTH]\n",
      "    \u001b[37m# add some new and informative keys\u001b[39;49;00m\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mroute_completion\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = vehicle.navigation.route_completion\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcurriculum_level\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.engine.current_level\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mscenario_index\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.engine.current_seed\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mnum_stored_maps\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.engine.map_manager.num_stored_maps\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mscenario_difficulty\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.engine.data_manager.current_scenario_difficulty\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mdata_coverage\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.engine.data_manager.data_coverage\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcurriculum_success\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.engine.curriculum_manager.current_success_rate\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcurriculum_route_completion\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.engine.curriculum_manager.current_route_completion\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mlateral_dist\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = lateral_now\n",
      "\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mstep_reward_lateral\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = lateral_penalty\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mstep_reward_heading\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = heading_penalty\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mstep_reward_action_smooth\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = steering_range_penalty\n",
      "    \u001b[34mreturn\u001b[39;49;00m reward, step_info\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print_source(ScenarioEnv.reward_function)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8c92ddd",
   "metadata": {},
   "source": [
    "This reward function is composed of six parts as follows:\n",
    "\n",
    "$R = c_{1} R_{driving} + c_{2} R_{lateral} + c_{3} R_{heading} + c_{4} R_{steering} - c_{5} P_{collision} + R_{termination}$\n",
    "\n",
    "- The **driving reward** $R_{driving} = d_t - d_{t-1}$, wherein the $d_t$ and $d_{t-1}$ denote the longitudinal movements of the target vehicle on the recorded trajectory of two consecutive time steps, providing dense reward to encourage agent to move toward the destination.\n",
    "- The **lateral reward** $R_{lateral}$ incentives the agent to driving as close to the reference trajectory as possible. When the distance between the car and the reference trajectory is larger than `max_lateral_dist`, the episode will be terminated due to driving out of road. \n",
    "- The **heading reward** $R_{heading}$ asks the agent to have the same heading as the direction of a certain point on the lane. The point is calculated per step by projecting the position of the vehicle to the reference line (recorded trajectory). \n",
    "- The **steering reward** $R_{steering}$ is for yielding large steering actions when the speed is high. The higher the speed is, the larger the penalty for the large steering angle will be.\n",
    "- The **collision penalty** $P_{collision}$ is a constant scalar for penalizing any collision behaviors with certain type of traffic participants.\n",
    "- The **termination reward** $R_{termination}$ contains a set of sparse rewards. At the end of episode, other dense rewards will be disabled and only one sparse reward will be given to the agent at the end of the episode according to its termination state. We use the `success_reward`, `out_of_road_penalty` for this environment. The penalty will be given as negative reward.\n",
    "\n",
    "There is a special config `no_negative_reward`. When setting it to `True`, the final reward will be clipped to `reward=max(reward, 0)`. This is helpful for stabilizing the training. We summarize the default reward config of the `ScenarioEnv` here:\n",
    "\n",
    "- success_reward=5.0,\n",
    "- out_of_road_penalty=5.0,\n",
    "- on_lane_line_penalty=1.,\n",
    "- crash_vehicle_penalty=1.,\n",
    "- crash_object_penalty=1.0,\n",
    "- crash_human_penalty=1.0,\n",
    "- driving_reward=1.0,\n",
    "- steering_range_penalty=0.5,\n",
    "- heading_penalty=1.0,\n",
    "- lateral_penalty=.5,\n",
    "- max_lateral_dist=4,\n",
    "- no_negative_reward=True,\n",
    "\n",
    "It is worth noting that we write some information like the lateral reward and heading reward to the `step_info`. Thus we can keep track of the training status of the agent."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e11c8234",
   "metadata": {},
   "source": [
    "## Cost Function\n",
    "\n",
    "Similar to the reward function, we also provide default cost function to measure the safety during driving. The cost function will be placed in the returned information dict as `info[\"cost\"]` after `env.step` function.\n",
    "\n",
    "- `crash_vehicle_cost = 1.0`: yield cost when crashing to other vehicles.\n",
    "- `crash_human_cost = 1.0`: yield cost when crashing to other vehicles.\n",
    "- `crash_object_cost = 1.0`: yield cost when crashing to objects, such as cones and triangles.\n",
    "- `out_of_road_cost = 1.0`: yield cost when driving out of the road.\n",
    "\n",
    "The implementation of cost function is simple and almost the same for `MetaDriveEnv` and `ScenarioEnv`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "227b6983",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[34mdef\u001b[39;49;00m \u001b[32mcost_function\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, vehicle_id: \u001b[36mstr\u001b[39;49;00m):\n",
      "    vehicle = \u001b[36mself\u001b[39;49;00m.vehicles[vehicle_id]\n",
      "    step_info = \u001b[36mdict\u001b[39;49;00m()\n",
      "    step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[34m0\u001b[39;49;00m\n",
      "    \u001b[34mif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m._is_out_of_road(vehicle):\n",
      "        step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mout_of_road_cost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34melif\u001b[39;49;00m vehicle.crash_vehicle:\n",
      "        step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_vehicle_cost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34melif\u001b[39;49;00m vehicle.crash_object:\n",
      "        step_info[\u001b[33m\"\u001b[39;49;00m\u001b[33mcost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] = \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_object_cost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    \u001b[34mreturn\u001b[39;49;00m step_info[\u001b[33m'\u001b[39;49;00m\u001b[33mcost\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m], step_info\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from metadrive.utils.doc_utils import print_source\n",
    "from metadrive.envs import MetaDriveEnv\n",
    "print_source(MetaDriveEnv.cost_function)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c6d4d7c",
   "metadata": {},
   "source": [
    "You can modify this function to add more information to the `step_info` dict. For example, you can log what kind of object raises this cost. Thus you can calculate how many cars the ego vehicle collides with in one episode by summing up the number of vehicle crashes in each step.  "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc0f70d9",
   "metadata": {},
   "source": [
    "## Termination and Truncation\n",
    "\n",
    "MetaDrive will terminate an episode of a vehicle if:\n",
    "\n",
    "1. the target vehicle arrive its destination,\n",
    "2. the vehicle drives out of the road,\n",
    "3. the vehicle crashes to other vehicles,\n",
    "4. the vehicle crashes to obstacles,\n",
    "5. the vehicle crashes to human, \n",
    "6. reach max step (horizon) limits, or\n",
    "7. the vehicle crashes to building (e.g. in Multi-agent Tollgate environment).\n",
    "\n",
    "The above termination function is implemented as:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "96d6424d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[34mdef\u001b[39;49;00m \u001b[32mdone_function\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, vehicle_id: \u001b[36mstr\u001b[39;49;00m):\n",
      "    vehicle = \u001b[36mself\u001b[39;49;00m.vehicles[vehicle_id]\n",
      "    done = \u001b[34mFalse\u001b[39;49;00m\n",
      "    max_step = \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mhorizon\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m] \u001b[35mis\u001b[39;49;00m \u001b[35mnot\u001b[39;49;00m \u001b[34mNone\u001b[39;49;00m \u001b[35mand\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.episode_lengths[vehicle_id] >= \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mhorizon\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]\n",
      "    done_info = {\n",
      "        TerminationState.CRASH_VEHICLE: vehicle.crash_vehicle,\n",
      "        TerminationState.CRASH_OBJECT: vehicle.crash_object,\n",
      "        TerminationState.CRASH_BUILDING: vehicle.crash_building,\n",
      "        TerminationState.CRASH_HUMAN: vehicle.crash_human,\n",
      "        TerminationState.CRASH_SIDEWALK: vehicle.crash_sidewalk,\n",
      "        TerminationState.OUT_OF_ROAD: \u001b[36mself\u001b[39;49;00m._is_out_of_road(vehicle),\n",
      "        TerminationState.SUCCESS: \u001b[36mself\u001b[39;49;00m._is_arrive_destination(vehicle),\n",
      "        TerminationState.MAX_STEP: max_step,\n",
      "        TerminationState.ENV_SEED: \u001b[36mself\u001b[39;49;00m.current_seed,\n",
      "        \u001b[37m# TerminationState.CURRENT_BLOCK: self.agent.navigation.current_road.block_ID(),\u001b[39;49;00m\n",
      "        \u001b[37m# crash_vehicle=False, crash_object=False, crash_building=False, out_of_road=False, arrive_dest=False,\u001b[39;49;00m\n",
      "    }\n",
      "\n",
      "    \u001b[37m# for compatibility\u001b[39;49;00m\n",
      "    \u001b[37m# crash almost equals to crashing with vehicles\u001b[39;49;00m\n",
      "    done_info[TerminationState.CRASH] = (\n",
      "        done_info[TerminationState.CRASH_VEHICLE] \u001b[35mor\u001b[39;49;00m done_info[TerminationState.CRASH_OBJECT]\n",
      "        \u001b[35mor\u001b[39;49;00m done_info[TerminationState.CRASH_BUILDING] \u001b[35mor\u001b[39;49;00m done_info[TerminationState.CRASH_SIDEWALK]\n",
      "        \u001b[35mor\u001b[39;49;00m done_info[TerminationState.CRASH_HUMAN]\n",
      "    )\n",
      "\n",
      "    \u001b[37m# determine env return\u001b[39;49;00m\n",
      "    \u001b[34mif\u001b[39;49;00m done_info[TerminationState.SUCCESS]:\n",
      "        done = \u001b[34mTrue\u001b[39;49;00m\n",
      "        \u001b[36mself\u001b[39;49;00m.logger.info(\n",
      "            \u001b[33m\"\u001b[39;49;00m\u001b[33mEpisode ended! Scenario Index: \u001b[39;49;00m\u001b[33m{}\u001b[39;49;00m\u001b[33m Reason: arrive_dest.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(\u001b[36mself\u001b[39;49;00m.current_seed),\n",
      "            extra={\u001b[33m\"\u001b[39;49;00m\u001b[33mlog_once\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m: \u001b[34mTrue\u001b[39;49;00m}\n",
      "        )\n",
      "    \u001b[34mif\u001b[39;49;00m done_info[TerminationState.OUT_OF_ROAD]:\n",
      "        done = \u001b[34mTrue\u001b[39;49;00m\n",
      "        \u001b[36mself\u001b[39;49;00m.logger.info(\n",
      "            \u001b[33m\"\u001b[39;49;00m\u001b[33mEpisode ended! Scenario Index: \u001b[39;49;00m\u001b[33m{}\u001b[39;49;00m\u001b[33m Reason: out_of_road.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(\u001b[36mself\u001b[39;49;00m.current_seed),\n",
      "            extra={\u001b[33m\"\u001b[39;49;00m\u001b[33mlog_once\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m: \u001b[34mTrue\u001b[39;49;00m}\n",
      "        )\n",
      "    \u001b[34mif\u001b[39;49;00m done_info[TerminationState.CRASH_VEHICLE] \u001b[35mand\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_vehicle_done\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]:\n",
      "        done = \u001b[34mTrue\u001b[39;49;00m\n",
      "        \u001b[36mself\u001b[39;49;00m.logger.info(\n",
      "            \u001b[33m\"\u001b[39;49;00m\u001b[33mEpisode ended! Scenario Index: \u001b[39;49;00m\u001b[33m{}\u001b[39;49;00m\u001b[33m Reason: crash vehicle \u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(\u001b[36mself\u001b[39;49;00m.current_seed),\n",
      "            extra={\u001b[33m\"\u001b[39;49;00m\u001b[33mlog_once\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m: \u001b[34mTrue\u001b[39;49;00m}\n",
      "        )\n",
      "    \u001b[34mif\u001b[39;49;00m done_info[TerminationState.CRASH_OBJECT] \u001b[35mand\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_object_done\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]:\n",
      "        done = \u001b[34mTrue\u001b[39;49;00m\n",
      "        \u001b[36mself\u001b[39;49;00m.logger.info(\n",
      "            \u001b[33m\"\u001b[39;49;00m\u001b[33mEpisode ended! Scenario Index: \u001b[39;49;00m\u001b[33m{}\u001b[39;49;00m\u001b[33m Reason: crash object \u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(\u001b[36mself\u001b[39;49;00m.current_seed),\n",
      "            extra={\u001b[33m\"\u001b[39;49;00m\u001b[33mlog_once\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m: \u001b[34mTrue\u001b[39;49;00m}\n",
      "        )\n",
      "    \u001b[34mif\u001b[39;49;00m done_info[TerminationState.CRASH_BUILDING]:\n",
      "        done = \u001b[34mTrue\u001b[39;49;00m\n",
      "        \u001b[36mself\u001b[39;49;00m.logger.info(\n",
      "            \u001b[33m\"\u001b[39;49;00m\u001b[33mEpisode ended! Scenario Index: \u001b[39;49;00m\u001b[33m{}\u001b[39;49;00m\u001b[33m Reason: crash building \u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(\u001b[36mself\u001b[39;49;00m.current_seed),\n",
      "            extra={\u001b[33m\"\u001b[39;49;00m\u001b[33mlog_once\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m: \u001b[34mTrue\u001b[39;49;00m}\n",
      "        )\n",
      "    \u001b[34mif\u001b[39;49;00m done_info[TerminationState.CRASH_HUMAN] \u001b[35mand\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mcrash_human_done\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]:\n",
      "        done = \u001b[34mTrue\u001b[39;49;00m\n",
      "        \u001b[36mself\u001b[39;49;00m.logger.info(\n",
      "            \u001b[33m\"\u001b[39;49;00m\u001b[33mEpisode ended! Scenario Index: \u001b[39;49;00m\u001b[33m{}\u001b[39;49;00m\u001b[33m Reason: crash human\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(\u001b[36mself\u001b[39;49;00m.current_seed),\n",
      "            extra={\u001b[33m\"\u001b[39;49;00m\u001b[33mlog_once\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m: \u001b[34mTrue\u001b[39;49;00m}\n",
      "        )\n",
      "    \u001b[34mif\u001b[39;49;00m done_info[TerminationState.MAX_STEP]:\n",
      "        \u001b[37m# single agent horizon has the same meaning as max_step_per_agent\u001b[39;49;00m\n",
      "        \u001b[34mif\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.config[\u001b[33m\"\u001b[39;49;00m\u001b[33mtruncate_as_terminate\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m]:\n",
      "            done = \u001b[34mTrue\u001b[39;49;00m\n",
      "        \u001b[36mself\u001b[39;49;00m.logger.info(\n",
      "            \u001b[33m\"\u001b[39;49;00m\u001b[33mEpisode ended! Scenario Index: \u001b[39;49;00m\u001b[33m{}\u001b[39;49;00m\u001b[33m Reason: max step \u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(\u001b[36mself\u001b[39;49;00m.current_seed),\n",
      "            extra={\u001b[33m\"\u001b[39;49;00m\u001b[33mlog_once\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m: \u001b[34mTrue\u001b[39;49;00m}\n",
      "        )\n",
      "    \u001b[34mreturn\u001b[39;49;00m done, done_info\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print_source(MetaDriveEnv.done_function)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24ca6bfe",
   "metadata": {},
   "source": [
    "Please note that in the Safe RL environment `SafeMetaDriveEnv`, the episode will not be terminated when vehicles crashing into objects or vehicles.\n",
    "This is because we wish to investigate the safety performance of a vehicle in an extremely dangerous environments.\n",
    "Terminating episodes too frequently will let the training becomes too hard to complete.\n",
    "\n",
    "It is worth noting that there is a special termination condition *max_step*. It means if the agent exists in the scene for more than `env.config[\"horizon\"]` steps, it will be terminated. The lifetime of an agent is called *agent episode*. In single agent environment, when the agent reaches *max_step*, the environment will be reset and thus the lifetime of the environment, \"environment episode\", has the same length as *agent episode*. However, in Multi-agent environment, the controllable target vehicles consistently respawn in the scene if old target vehicles are terminated. The the environment will stop spawning new target vehicles if the length of *environment episode* exceeds `env.config[\"horizon]`. As the environment has to wait for the termination of all existing agents, the length of *environmental episode* will be greater than `env.config[\"horizon]` but less than 2*`env.config[\"horizon]`, while the length of *agent episode* is always less than `env.config[\"horizon]`. \n",
    "\n",
    "**Note: when an agent reaches max_step limit, the terminated signal received by `_,_,terminated,truncated,_=env.step(action)` will be False, but the truncated signal is True. To make the terminated signal=True when truncated is flagged, turn on `truncate_as_terminate` in env_config. However, we don't encourage this behavior, as [the gymnasium asks RL environments explicitly distinguish the timeout (truncation) and done (Termination)](https://gymnasium.farama.org/tutorials/gymnasium_basics/handling_time_limits/).**\n",
    "\n",
    "The following example help elaborate on the timeout mechanism used in MetaDrive. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5af30f13",
   "metadata": {},
   "outputs": [],
   "source": [
    "from metadrive.envs.metadrive_env import MetaDriveEnv\n",
    "\n",
    "def test_horizon(horizon, truncate_as_terminate):\n",
    "    env = MetaDriveEnv({\n",
    "            \"horizon\": horizon,\n",
    "            \"log_level\": 50,\n",
    "            \"truncate_as_terminate\": truncate_as_terminate})\n",
    "    o, _ = env.reset()\n",
    "    try:\n",
    "        for i in range(1, 1000):\n",
    "            _,_,tm,tc,_=env.step([0.,0.])\n",
    "            if tm or tc:\n",
    "                break\n",
    "    finally:\n",
    "        env.close()\n",
    "    return tm, tc, i\n",
    "\n",
    "tm, tc, epi_length = test_horizon(500, truncate_as_terminate=False)\n",
    "# tc when step==500, and no tm signal\n",
    "assert tc and not tm and epi_length==500 \n",
    "\n",
    "tm, tc, epi_length = test_horizon(500, truncate_as_terminate=True)\n",
    "# tc and tm when step==500\n",
    "assert tm == tc == True and epi_length==500\n",
    "\n",
    "tm, tc, epi_length = test_horizon(None, truncate_as_terminate=False)\n",
    "# no length limit in this case, and no tc and tm signals!\n",
    "assert tm == tc == False and epi_length > 500\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42e7c8a7",
   "metadata": {},
   "source": [
    "## Step Information\n",
    "\n",
    "The step information dict `info` contains rich information about current state of the environment and the target vehicle. \n",
    "The step info is collected from various sources such as the engine, reward function, termination function, traffic manager, agent manager and so on.\n",
    "We summarize the dict as follows:\n",
    "```\n",
    "    {\n",
    "        # Number of vehicles being overtaken by ego vehicle in this episode\n",
    "        'overtake_vehicle_num': 0,\n",
    "\n",
    "        # Current velocity in km/h\n",
    "        'velocity': 0.0,\n",
    "\n",
    "        # The current normalized steering signal in [-1, 1]\n",
    "        'steering': -0.06901532411575317,\n",
    "\n",
    "        # The current normalized acceleration signal in [-1, 1]\n",
    "        'acceleration': -0.2931942343711853,\n",
    "\n",
    "        # The normalized action after clipped who is applied to the ego vehicle\n",
    "        'raw_action': (-0.06901532411575317, -0.2931942343711853),\n",
    "\n",
    "        # Whether crash to vehicle / object / building\n",
    "        'crash_vehicle': False,\n",
    "        'crash_object': False,\n",
    "        'crash_building': False,\n",
    "        'crash': False,  # Whether any kind of crash happens\n",
    "\n",
    "        # Whether going out of the road / arrive destination\n",
    "        # or exceeding the maximal episode length\n",
    "        'out_of_road': False,\n",
    "        'arrive_dest': False,\n",
    "        'max_step': False,\n",
    "\n",
    "        # The reward in this time step / the whole episode so far\n",
    "        'step_reward': 0.0,\n",
    "        'episode_reward': 0.0,\n",
    "\n",
    "        # The cost in this time step\n",
    "        'cost': 0,\n",
    "\n",
    "        # The length of current episode\n",
    "        'episode_length': 1\n",
    "    }\n",
    "```\n",
    "\n",
    "The content of this dict keeps updating, and thus the content above may be out of date.\n",
    "We encourage users to write customized data to this dict, so more status can be exposed to monitor the simulation even without visualization.    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a339830",
   "metadata": {},
   "source": [
    "## Customization\n",
    "To compose your own reward, cost and termination function. Just make a new environment and override the `reward_function`, `cost_function`, and `termination_function` of the base environment class. You can also record more information in `step_info` returned by these functions and deliver it outside the simulator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b2a81768",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[38;20m[INFO] Environment: MyEnv\u001b[0m\n",
      "\u001b[38;20m[INFO] MetaDrive version: 0.4.1.2\u001b[0m\n",
      "\u001b[38;20m[INFO] Sensors: [lidar: Lidar(), side_detector: SideDetector(), lane_line_detector: LaneLineDetector()]\u001b[0m\n",
      "\u001b[38;20m[INFO] Render Mode: none\u001b[0m\n",
      "\u001b[38;20m[INFO] Horizon (Max steps per agent): None\u001b[0m\n",
      "\u001b[38;20m[INFO] Assets version: 0.4.1.2\u001b[0m\n",
      "\u001b[38;20m[INFO] Known Pipes: glxGraphicsPipe\u001b[0m\n",
      "\u001b[38;20m[INFO] Start Scenario Index: 0, Num Scenarios : 1\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "reward: -10, `is_customized` in info: True\n"
     ]
    }
   ],
   "source": [
    "from metadrive.envs.metadrive_env import MetaDriveEnv\n",
    "\n",
    "class MyEnv(MetaDriveEnv):\n",
    "    \n",
    "    def reward_function(*args, **kwargs):\n",
    "        return -10, {\"is_customized\": True}\n",
    "    \n",
    "env=MyEnv()\n",
    "env.reset()\n",
    "_,r,_,_,info = env.step([0,0])\n",
    "assert r==-10 and info[\"is_customized\"]\n",
    "print(\"reward: {}, `is_customized` in info: {}\".format(r, info[\"is_customized\"]))\n",
    "env.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  },
  "mystnb": {
   "execution_mode": "force"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}