-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathconfigs.yaml
More file actions
163 lines (151 loc) · 4.58 KB
/
configs.yaml
File metadata and controls
163 lines (151 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
defaults:
  # --- Dubins car environment parameters ---
  speed: 1.0
  turnRate: 1.25
  x_min: -1.5
  x_max: 1.5
  y_min: -1.5
  y_max: 1.5
  buffer: 0.1
  dt: 0.05
  # obstacle (circle) position and radius
  obs_x: 0
  obs_y: 0
  obs_r: 0.5
  # --- offline data parameters ---
  data_length: 100
  num_trajs: 4000 # 2000 in paper
  num_train_trajs: 3800 # 1900 in paper
  size: [128, 128]
  dataset_path: "wm_demos128.pkl" # "128" should be the same as the size above
  # --- offline dreamer parameters, from https://github.com/NM512/dreamerv3-torch ---
  logdir: "logs/dreamer_dubins"
  rssm_ckpt_path: logs/dreamer_dubins/rssm_ckpt.pt # change this to whatever the path to your rssm checkpoint is
  traindir: null
  evaldir: null
  offline_traindir: ""
  offline_evaldir: ""
  seed: 0
  deterministic_run: false
  # NOTE(review): scientific-notation scalars like 1e6 / 5e2 are resolved as
  # *strings* by YAML 1.1 loaders (e.g. PyYAML); the dreamerv3-torch config
  # loader converts them — keep them in this form.
  steps: 1e6
  parallel: false
  eval_every: 5e2
  eval_episode_num: 10
  log_every: 5e2
  reset_every: 0
  device: "cuda:0"
  compile: true
  precision: 32
  debug: false
  video_pred_log: true
  rssm_train_steps: 10000 # 100000 in paper
  # --- Environment ---
  task: "dubins-wm"
  envs: 1
  action_repeat: 1
  time_limit: 100
  grayscale: false
  prefill: 5000
  reward_EMA: true
  # --- Model ---
  dyn_hidden: 512
  dyn_deter: 512
  dyn_stoch: 32
  dyn_discrete: 0 # 0 for continuous latent
  dyn_rec_depth: 1
  dyn_mean_act: "none"
  dyn_std_act: "sigmoid2"
  dyn_min_std: 0.1
  grad_heads: ["decoder", "margin", "cont"]
  units: 512
  act: "SiLU"
  norm: true
  encoder:
    {mlp_keys: "obs_state", cnn_keys: "image", act: "SiLU", norm: true, cnn_depth: 32, kernel_size: 4, minres: 4, mlp_layers: 5, mlp_units: 1024, symlog_inputs: true}
  decoder:
    {mlp_keys: "obs_state", cnn_keys: "image", act: "SiLU", norm: true, cnn_depth: 32, kernel_size: 4, minres: 4, mlp_layers: 5, mlp_units: 1024, cnn_sigmoid: false, image_dist: mse, vector_dist: symlog_mse, outscale: 1.0}
  actor:
    {layers: 2, dist: "normal", entropy: 3e-4, unimix_ratio: 0.01, std: "learned", min_std: 0.1, max_std: 1.0, temp: 0.1, lr: 3e-5, eps: 1e-5, grad_clip: 100.0, outscale: 1.0}
  critic:
    {layers: 2, dist: "symlog_disc", slow_target: true, slow_target_update: 1, slow_target_fraction: 0.02, lr: 3e-5, eps: 1e-5, grad_clip: 100.0, outscale: 0.0}
  # reward head disabled — "margin" is trained instead (see grad_heads above)
  # reward_head:
  #   {layers: 2, dist: "symlog_disc", loss_scale: 1.0, outscale: 0.0}
  cont_head:
    {layers: 2, loss_scale: 1.0, outscale: 1.0}
  # new
  margin_head:
    {layers: 2, loss_scale: 10.0}
  gamma_lx: 0.75
  dyn_scale: 0.5
  rep_scale: 0.1
  kl_free: 1.0
  weight_decay: 0.0
  unimix_ratio: 0.01
  initial: "learned"
  # --- Training ---
  batch_size: 16
  batch_length: 64
  train_ratio: 512
  pretrain: 100
  model_lr: 1e-4
  obs_lr: 1e-3
  lx_lr: 1e-4
  opt_eps: 1e-8
  grad_clip: 1000
  dataset_size: 1000000
  opt: "adam"
  # --- Behavior ---
  discount: 0.997
  discount_lambda: 0.95
  imag_horizon: 15
  imag_gradient: "dynamics"
  imag_gradient_mix: 0.0
  eval_state_mean: false
  # --- Exploration ---
  expl_behavior: "greedy"
  expl_until: 0
  expl_extr_scale: 0.0
  expl_intr_scale: 1.0
  disag_target: "stoch"
  disag_log: true
  disag_models: 10
  disag_offset: 1
  disag_layers: 4
  disag_units: 400
  disag_action_cond: false
  # --- LCRL ---
  # NOTE(review): kebab-case keys below appear to mirror argparse flags
  # (inline comments record the original type/default) — do not rename.
  reward-threshold: null
  #seed: 0
  buffer-size: 40000
  actor-lr: 1e-4
  critic-lr: 1e-3
  gamma-pyhj: 0.9999 # type=float, default=0.95)
  tau: 0.005 # type=float, default=0.005)
  exploration-noise: 0.1 # type=float, default=0.1)
  epoch: 1 # type=int, default=10)
  total-episodes: 15 # type=int, default=160)
  step-per-epoch: 40000 # type=int, default=40000)
  step-per-collect: 8 # type=int, default=8)
  update-per-step: 0.125 # type=float, default=0.125)
  batch_size-pyhj: 512 # type=int, default=512)
  control-net: [128, 128, 128] # type=int, nargs="*", default=None) # for control policy
  critic-net: [128, 128, 128] # type=int, nargs="*", default=None) # for critic net
  training-num: 1 # type=int, default=8)
  test-num: 1 # type=int, default=100)
  render: 0.0 # type=float, default=0.)
  rew-norm: false # action="store_true", default=False)
  n-step: 1 # type=int, default=1)
  continue-training-logdir: null # type=str, default=None)
  continue-training-epoch: null # type=int, default=None)
  actor-gradient-steps: 1 # type=int, default=1)
  is-game-baseline: false # type=bool, default=False) # it will be set automatically
  target-update-freq: 400 # type=int, default=400)
  auto-alpha: 1
  alpha-lr: 3e-4
  alpha: 0.2
  weight-decay-pyhj: 0.001
  actor-activation: "ReLU" # type=str, default="ReLU")
  critic-activation: "ReLU" # type=str, default="ReLU")
  kwargs: {} # type=str, default="{}")
  warm-start-path: null # type=str, default=None) # e.g., log/ra_droneracing_Game-v6/epoch_id_10/policy.pth
  # grid resolution for value-function evaluation
  nx: 31
  ny: 31