Commit 5bdcf48

Merge remote-tracking branch 'origin/main' into dev-pooltool

2 parents: ec4d6df + 540bdcb

160 files changed: 13149 additions, 987 deletions


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1421,7 +1421,7 @@ log*
 default*
 events.*

-# DI-engine special key
+# LightZero special key
 *default_logger.txt
 *default_tb_logger
 *evaluate.txt
@@ -1448,3 +1448,4 @@ events.*

 # pooltool-specific stuff
 !/assets/pooltool/**
+lzero/mcts/ctree/ctree_alphazero/pybind11

README.md

Lines changed: 19 additions & 18 deletions
@@ -122,24 +122,25 @@ LightZero is a library with a [PyTorch](https://pytorch.org/) implementation of

 The environments and algorithms currently supported by LightZero are shown in the table below:

-| Env./Algo.    | AlphaZero | MuZero | EfficientZero | Sampled EfficientZero | Gumbel MuZero | Stochastic MuZero |
-|---------------| -------- | ------ |-------------| ------------------ | ---------- |----------------|
-| TicTacToe     | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
-| Gomoku        | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
-| Connect4      | ✔ | ✔ | 🔒 | 🔒 | 🔒 | 🔒 |
-| 2048          | --- | ✔ | 🔒 | 🔒 | 🔒 | ✔ |
-| Chess         | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
-| Go            | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
-| CartPole      | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| Pendulum      | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| LunarLander   | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| BipedalWalker | --- | ✔ | ✔ | ✔ | ✔ | 🔒 |
-| Atari         | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| MuJoCo        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| MiniGrid      | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| Bsuite        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| Memory        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| SumToThree (billiards) | --- | 🔒 | 🔒 | ✔ | 🔒 | 🔒 |
+
+| Env./Algo.    | AlphaZero | MuZero | EfficientZero | Sampled EfficientZero | Gumbel MuZero | Stochastic MuZero | UniZero | ReZero |
+|---------------| -------- | ------ |-------------| ------------------ | ---------- |----------------|---------------|----------------|
+| TicTacToe     | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 | ✔ | 🔒 |
+| Gomoku        | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 | ✔ | ✔ |
+| Connect4      | ✔ | ✔ | 🔒 | 🔒 | 🔒 | 🔒 | ✔ | ✔ |
+| 2048          | --- | ✔ | 🔒 | 🔒 | 🔒 | ✔ | ✔ | 🔒 |
+| Chess         | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
+| Go            | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
+| CartPole      | --- | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ |
+| Pendulum      | --- | ✔ | ✔ | ✔ | ✔ | ✔ | 🔒 | 🔒 |
+| LunarLander   | --- | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | 🔒 |
+| BipedalWalker | --- | ✔ | ✔ | ✔ | ✔ | 🔒 | 🔒 | 🔒 |
+| Atari         | --- | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ |
+| MuJoCo        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | 🔒 | 🔒 |
+| MiniGrid      | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
+| Bsuite        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
+| Memory        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
+| SumToThree (billiards) | --- | 🔒 | 🔒 | ✔ | 🔒 | 🔒 | 🔒 | 🔒 |


 <sup>(1): "✔" means that the corresponding item is finished and well-tested.</sup>

README.zh.md

Lines changed: 18 additions & 19 deletions
@@ -110,25 +110,24 @@ LightZero is an MCTS algorithm library implemented in [PyTorch](https://pytorch.org/),

 The environments and algorithms currently supported by LightZero are shown in the table below:

-| Env./Algo.    | AlphaZero | MuZero | EfficientZero | Sampled EfficientZero | Gumbel MuZero | Stochastic MuZero |
-|---------------| -------- | ------ |-------------| ------------------ | ---------- |----------------|
-| TicTacToe     | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
-| Gomoku        | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
-| Connect4      | ✔ | ✔ | 🔒 | 🔒 | 🔒 | 🔒 |
-| 2048          | --- | ✔ | 🔒 | 🔒 | 🔒 | ✔ |
-| Chess         | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
-| Go            | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
-| CartPole      | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| Pendulum      | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| LunarLander   | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| BipedalWalker | --- | ✔ | ✔ | ✔ | ✔ | 🔒 |
-| Atari         | --- | ✔ | ✔ | ✔ | ✔ | ✔ |
-| MuJoCo        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| MiniGrid      | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| Bsuite        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| Memory        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 |
-| SumToThree (billiards) | --- | 🔒 | 🔒 | ✔ | 🔒 | 🔒 |
-
+| Env./Algo.    | AlphaZero | MuZero | EfficientZero | Sampled EfficientZero | Gumbel MuZero | Stochastic MuZero | UniZero | ReZero |
+|---------------| -------- | ------ |-------------| ------------------ | ---------- |----------------|---------------|----------------|
+| TicTacToe     | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 | ✔ | 🔒 |
+| Gomoku        | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 | ✔ | ✔ |
+| Connect4      | ✔ | ✔ | 🔒 | 🔒 | 🔒 | 🔒 | ✔ | ✔ |
+| 2048          | --- | ✔ | 🔒 | 🔒 | 🔒 | ✔ | ✔ | 🔒 |
+| Chess         | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
+| Go            | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 | 🔒 |
+| CartPole      | --- | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ |
+| Pendulum      | --- | ✔ | ✔ | ✔ | ✔ | ✔ | 🔒 | 🔒 |
+| LunarLander   | --- | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | 🔒 |
+| BipedalWalker | --- | ✔ | ✔ | ✔ | ✔ | 🔒 | 🔒 | 🔒 |
+| Atari         | --- | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ |
+| MuJoCo        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | 🔒 | 🔒 |
+| MiniGrid      | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
+| Bsuite        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
+| Memory        | --- | ✔ | ✔ | ✔ | 🔒 | 🔒 | ✔ | 🔒 |
+| SumToThree (billiards) | --- | 🔒 | 🔒 | ✔ | 🔒 | 🔒 | 🔒 | 🔒 |

 <sup>(1): "✔" indicates that the corresponding item is finished and well-tested.</sup>

lzero/agent/alphazero.py

Lines changed: 2 additions & 2 deletions
@@ -198,9 +198,9 @@ def train(
 new_data = sum(new_data, [])

 if self.cfg.policy.update_per_collect is None:
-    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the model_update_ratio.
+    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the replay_ratio.
     collected_transitions_num = len(new_data)
-    update_per_collect = int(collected_transitions_num * self.cfg.policy.model_update_ratio)
+    update_per_collect = int(collected_transitions_num * self.cfg.policy.replay_ratio)
 replay_buffer.push(new_data, cur_collector_envstep=collector.envstep)

 # Learn policy from collected data
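
The only change here, repeated in the agent and entry files below, is the rename of the config key `model_update_ratio` to `replay_ratio` in the fallback that derives `update_per_collect` from the amount of freshly collected data. A minimal standalone sketch of that fallback, assuming only that the policy config exposes `update_per_collect` and `replay_ratio` attributes (the helper function is illustrative, not part of LightZero's API):

```python
from types import SimpleNamespace

def resolve_update_per_collect(policy_cfg, collected_transitions_num: int) -> int:
    """Return how many gradient updates to run after one collection phase."""
    if policy_cfg.update_per_collect is not None:
        # Explicit setting wins: use it unchanged.
        return policy_cfg.update_per_collect
    # Fallback: derive the update count from the replay ratio,
    # i.e. gradient updates per newly collected transition.
    return int(collected_transitions_num * policy_cfg.replay_ratio)

# Example: 400 newly collected transitions with replay_ratio=0.25 -> 100 updates.
cfg = SimpleNamespace(update_per_collect=None, replay_ratio=0.25)
assert resolve_update_per_collect(cfg, 400) == 100
```

Deriving the update count this way keeps the replay ratio (updates per collected transition) constant even when the per-iteration collection size varies.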

lzero/agent/efficientzero.py

Lines changed: 2 additions & 2 deletions
@@ -228,9 +228,9 @@ def train(
 # Collect data by default config n_sample/n_episode.
 new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs)
 if self.cfg.policy.update_per_collect is None:
-    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the model_update_ratio.
+    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the replay_ratio.
     collected_transitions_num = sum([len(game_segment) for game_segment in new_data[0]])
-    update_per_collect = int(collected_transitions_num * self.cfg.policy.model_update_ratio)
+    update_per_collect = int(collected_transitions_num * self.cfg.policy.replay_ratio)
 # save returned new_data collected by the collector
 replay_buffer.push_game_segments(new_data)
 # remove the oldest data if the replay buffer is full.

lzero/agent/gumbel_muzero.py

Lines changed: 2 additions & 2 deletions
@@ -228,9 +228,9 @@ def train(
 # Collect data by default config n_sample/n_episode.
 new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs)
 if self.cfg.policy.update_per_collect is None:
-    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the model_update_ratio.
+    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the replay_ratio.
     collected_transitions_num = sum([len(game_segment) for game_segment in new_data[0]])
-    update_per_collect = int(collected_transitions_num * self.cfg.policy.model_update_ratio)
+    update_per_collect = int(collected_transitions_num * self.cfg.policy.replay_ratio)
 # save returned new_data collected by the collector
 replay_buffer.push_game_segments(new_data)
 # remove the oldest data if the replay buffer is full.

lzero/agent/muzero.py

Lines changed: 2 additions & 2 deletions
@@ -228,9 +228,9 @@ def train(
 # Collect data by default config n_sample/n_episode.
 new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs)
 if self.cfg.policy.update_per_collect is None:
-    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the model_update_ratio.
+    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the replay_ratio.
     collected_transitions_num = sum([len(game_segment) for game_segment in new_data[0]])
-    update_per_collect = int(collected_transitions_num * self.cfg.policy.model_update_ratio)
+    update_per_collect = int(collected_transitions_num * self.cfg.policy.replay_ratio)
 # save returned new_data collected by the collector
 replay_buffer.push_game_segments(new_data)
 # remove the oldest data if the replay buffer is full.

lzero/agent/sampled_alphazero.py

Lines changed: 2 additions & 2 deletions
@@ -198,9 +198,9 @@ def train(
 new_data = sum(new_data, [])

 if self.cfg.policy.update_per_collect is None:
-    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the model_update_ratio.
+    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the replay_ratio.
     collected_transitions_num = len(new_data)
-    update_per_collect = int(collected_transitions_num * self.cfg.policy.model_update_ratio)
+    update_per_collect = int(collected_transitions_num * self.cfg.policy.replay_ratio)
 replay_buffer.push(new_data, cur_collector_envstep=collector.envstep)

 # Learn policy from collected data

lzero/agent/sampled_efficientzero.py

Lines changed: 2 additions & 2 deletions
@@ -228,9 +228,9 @@ def train(
 # Collect data by default config n_sample/n_episode.
 new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs)
 if self.cfg.policy.update_per_collect is None:
-    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the model_update_ratio.
+    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the replay_ratio.
     collected_transitions_num = sum([len(game_segment) for game_segment in new_data[0]])
-    update_per_collect = int(collected_transitions_num * self.cfg.policy.model_update_ratio)
+    update_per_collect = int(collected_transitions_num * self.cfg.policy.replay_ratio)
 # save returned new_data collected by the collector
 replay_buffer.push_game_segments(new_data)
 # remove the oldest data if the replay buffer is full.

lzero/entry/__init__.py

Lines changed: 7 additions & 4 deletions
@@ -1,7 +1,10 @@
-from .train_alphazero import train_alphazero
 from .eval_alphazero import eval_alphazero
-from .train_muzero import train_muzero
-from .train_muzero_with_reward_model import train_muzero_with_reward_model
 from .eval_muzero import eval_muzero
 from .eval_muzero_with_gym_env import eval_muzero_with_gym_env
-from .train_muzero_with_gym_env import train_muzero_with_gym_env
+from .train_alphazero import train_alphazero
+from .train_muzero import train_muzero
+from .train_muzero_with_gym_env import train_muzero_with_gym_env
+from .train_muzero_with_gym_env import train_muzero_with_gym_env
+from .train_muzero_with_reward_model import train_muzero_with_reward_model
+from .train_rezero import train_rezero
+from .train_unizero import train_unizero
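
Besides alphabetizing the re-exports (and, incidentally, importing `train_muzero_with_gym_env` twice, at new lines 6 and 7), this block adds two new entry points, `train_rezero` and `train_unizero`. A hypothetical usage sketch, assuming they follow the same `[user_config, create_cfg]` plus `seed`/`max_env_step` calling convention as the existing `train_muzero` entry; the zoo config path in the comments is a placeholder, not something introduced by this commit:

```python
from lzero.entry import train_rezero, train_unizero  # new exports in this commit

# A config pair would normally come from an algorithm-specific module under zoo/,
# e.g. (placeholder path, assumed layout):
# from zoo.classic_control.cartpole.config.cartpole_unizero_config import main_config, create_config

# Assumed to mirror train_muzero's convention: a [user_config, create_cfg] pair,
# a seed, and a stopping budget such as max_env_step.
# train_unizero([main_config, create_config], seed=0, max_env_step=int(1e5))
# train_rezero([main_config, create_config], seed=0, max_env_step=int(1e5))
```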

lzero/entry/eval_muzero.py

Lines changed: 4 additions & 3 deletions
@@ -13,6 +13,7 @@
 from ding.utils import set_pkg_seed
 from ding.worker import BaseLearner
 from lzero.worker import MuZeroEvaluator
+from lzero.entry.utils import initialize_zeros_batch


 def eval_muzero(
@@ -25,7 +26,7 @@ def eval_muzero(
 ) -> 'Policy':  # noqa
     """
     Overview:
-        The eval entry for MCTS+RL algorithms, including MuZero, EfficientZero, Sampled EfficientZero.
+        The eval entry for MCTS+RL algorithms, including MuZero, EfficientZero, Sampled EfficientZero, StochasticMuZero, GumbelMuZero, UniZero, etc.
     Arguments:
         - input_cfg (:obj:`Tuple[dict, dict]`): Config in dict type.
             ``Tuple[dict, dict]`` type means [user_config, create_cfg].
@@ -38,8 +39,8 @@ def eval_muzero(
         - policy (:obj:`Policy`): Converged policy.
     """
     cfg, create_cfg = input_cfg
-    assert create_cfg.policy.type in ['efficientzero', 'muzero', 'stochastic_muzero', 'gumbel_muzero', 'sampled_efficientzero'], \
-        "LightZero now only support the following algo.: 'efficientzero', 'muzero', 'stochastic_muzero', 'gumbel_muzero', 'sampled_efficientzero'"
+    assert create_cfg.policy.type in ['efficientzero', 'muzero', 'muzero_context', 'muzero_rnn_full_obs', 'stochastic_muzero', 'gumbel_muzero', 'sampled_efficientzero', 'unizero'], \
+        "LightZero now only support the following algo.: 'efficientzero', 'muzero', 'muzero_context', 'muzero_rnn_full_obs', 'stochastic_muzero', 'gumbel_muzero', 'sampled_efficientzero', 'unizero'"

     if cfg.policy.cuda and torch.cuda.is_available():
         cfg.policy.device = 'cuda'

lzero/entry/train_alphazero.py

Lines changed: 2 additions & 2 deletions
@@ -119,9 +119,9 @@ def train_alphazero(
 new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs)
 new_data = sum(new_data, [])
 if cfg.policy.update_per_collect is None:
-    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the model_update_ratio.
+    # update_per_collect is None, then update_per_collect is set to the number of collected transitions multiplied by the replay_ratio.
     collected_transitions_num = len(new_data)
-    update_per_collect = int(collected_transitions_num * cfg.policy.model_update_ratio)
+    update_per_collect = int(collected_transitions_num * cfg.policy.replay_ratio)
 replay_buffer.push(new_data, cur_collector_envstep=collector.envstep)

 # Learn policy from collected data
