Commit 93a13d3
Epsilon Speedy Explorer (#1052)
* Add epsilon speedy explorer
* Fix type stability
* Add docstring
* Finish docstring
* Add epsilon-greedy explorer tests
* add better prob tests

---------

Co-authored-by: Jeremiah Lewis <--get>
1 parent 06cabb9 commit 93a13d3

File tree

12 files changed: +252 -5 lines changed

src/ReinforcementLearningCore/Project.toml

+1 -3

@@ -37,7 +37,6 @@ Metal = "1.0"
 ProgressMeter = "1"
 Reexport = "1"
 ReinforcementLearningBase = "0.12"
-ReinforcementLearningFarm = "0.0.1"
 ReinforcementLearningTrajectories = "0.3.7"
 Statistics = "1"
 StatsBase = "0.32, 0.33, 0.34"
@@ -53,10 +52,9 @@ Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
-ReinforcementLearningFarm = "14eff660-7080-4cec-bba2-cfb12cd77ac3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [targets]
-test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "ReinforcementLearningFarm", "Test", "UUIDs"]
+test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "Test", "UUIDs"]

src/ReinforcementLearningCore/src/policies/explorers/epsilon_greedy_explorer.jl

+2 -2

@@ -133,8 +133,8 @@ end
 #####
 
 """
-    prob(s::EpsilonGreedyExplorer, values) ->Categorical
-    prob(s::EpsilonGreedyExplorer, values, mask) ->Categorical
+    prob(s::EpsilonGreedyExplorer, values) -> Categorical
+    prob(s::EpsilonGreedyExplorer, values, mask) -> Categorical
 
 Return the probability of selecting each action given the estimated `values` of each action.
 """

@@ -0,0 +1,73 @@
+using Test
+using Distributions: Categorical
+using ReinforcementLearningCore: EpsilonGreedyExplorer, GreedyExplorer, get_ϵ
+using Random
+
+@testset "EpsilonGreedyExplorer" begin
+    @testset "get_ϵ for linear kind" begin
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 50) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 100) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 150) ≈ 0.5
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 200) ≈ 0.1
+    end
+
+    @testset "get_ϵ for exp kind" begin
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 50) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 100) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 150) ≈ 0.5852245277701068
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 2000) ≈ 0.1 atol=1e-2
+    end
+
+    @testset "EpsilonGreedyExplorer Tests" begin
+        # Test plan! for is_break_tie=true
+        rng = Random.default_rng(123)
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=true, rng=rng)
+        values = [0.1, 0.5, 0.5, 0.3]
+        actions = []
+        for _ in 1:300
+            push!(actions, RLBase.plan!(s, values))
+        end
+        @test length(unique(actions)) == 4
+    end
+
+    @testset "EpsilonGreedyExplorer Tests" begin
+        # Test plan! for is_break_tie=false
+        rng = Random.default_rng(123)
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=false, rng=rng)
+        values = [0.1, 0.5, 0.5, 0.3]
+        actions = []
+        for _ in 1:300
+            push!(actions, RLBase.plan!(s, values))
+        end
+        @test length(unique(actions)) == 4
+    end
+
+    @testset "prob for is_break_tie=true" begin
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=true)
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.prob(s, values) ≈ Categorical([0.225, 0.275, 0.275, 0.225])
+        @test RLBase.prob(s, values, 2) ≈ 0.275
+    end
+
+    @testset "prob for is_break_tie=false" begin
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=false)
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.prob(s, values) ≈ Categorical([0.225, 0.32499999999999996, 0.225, 0.225])
+        @test RLBase.prob(s, values, 2) ≈ 0.32500000000000007
+    end
+end
+
+@testset "GreedyExplorer" begin
+    @testset "plan!" begin
+        s = GreedyExplorer()
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.plan!(s, values) == 2
+    end
+
+    @testset "prob" begin
+        s = GreedyExplorer()
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.prob(s, values) ≈ Categorical([0.0, 1.0, 0.0, 0.0])
+        @test RLBase.prob(s, values, 2) == 1.0
+    end
+end
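
The expected constants above follow directly from the two decay schedules. A quick sanity check, assuming the usual definitions (linear interpolation after the warmup phase; exponential decay toward ϵ_stable):

    # :linear at step 150: 50 of the 100 decay steps have elapsed
    0.9 - (0.9 - 0.1) * 50 / 100        # = 0.5
    # :exp at step 150:
    0.1 + (0.9 - 0.1) * exp(-50 / 100)  # ≈ 0.5852245277701068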

@@ -0,0 +1 @@
+include("epsilon_greedy_explorer.jl")

@@ -1,4 +1,5 @@
 include("agent.jl")
 include("multi_agent.jl")
 include("learners/learners.jl")
+include("explorers/explorers.jl")
 include("q_based_policy.jl")

src/ReinforcementLearningFarm/Project.toml

+4

@@ -5,13 +5,17 @@ version = "0.0.1"
 [deps]
 CircularArrayBuffers = "9de3a189-e0c0-4e15-ba3b-b14b9fb0aec1"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
 
 [compat]
+FillArrays = "1"
 CircularArrayBuffers = "0.1.12"
+Distributions = "0.25"
 ReinforcementLearningBase = "0.12"
 ReinforcementLearningCore = "0.14"
 ReinforcementLearningEnvironments = "0.8"

@@ -1 +1,2 @@
 include("tabular/tabular.jl")
+include("explorers/explorers.jl")

@@ -0,0 +1,92 @@
+
+using ReinforcementLearningCore
+using ReinforcementLearningBase
+import ReinforcementLearningBase: RLBase
+
+using FillArrays: Trues
+using Random
+using Distributions: Categorical
+using Base
+
+"""
+    EpsilonSpeedyExplorer(β::Float64)
+
+`EpsilonSpeedyExplorer` is an explorer that selects the action with the maximum value with probability `1 - ϵ` and selects a random action with probability `ϵ`.
+The probability of selecting a random action is given by `exp(-β * t)`, where `t` is the number of times `plan!` has been called.
+`EpsilonSpeedyExplorer` differs from `EpsilonGreedyExplorer` in that it uses the `exp` function to calculate the probability of selecting a random action over the full range of `t` and only accepts one argument, `β`.
+"""
+struct EpsilonSpeedyExplorer{R} <: AbstractExplorer
+    β::Float64
+    β_neg::Float64
+    step::Base.RefValue{Int}
+    rng::R
+end
+
+function EpsilonSpeedyExplorer(β::Float64)
+    EpsilonSpeedyExplorer{typeof(Random.GLOBAL_RNG)}(
+        β,
+        β * -1,
+        Ref(1),
+        Random.GLOBAL_RNG,
+    )
+end
+
+function get_ϵ(s::EpsilonSpeedyExplorer)
+    exp(s.β_neg * s.step[])
+end
+
+"""
+    RLBase.plan!(s::EpsilonSpeedyExplorer, values)
+
+!!! note
+    If multiple values share the maximum value, the first one is selected;
+    ties are not broken randomly.
+
+    `NaN` will be filtered unless all the values are `NaN`.
+    In that case, a random one will be returned.
+"""
+function RLBase.plan!(s::EpsilonSpeedyExplorer{R}, values::A) where {I<:Real, A<:AbstractArray{I}, R<:Random.AbstractRNG}
+    ϵ = get_ϵ(s)
+    s.step[] += 1
+    rand(s.rng) >= ϵ ? findmax(values)[2] : rand(s.rng, 1:length(values))
+end
+
+RLBase.plan!(s::EpsilonSpeedyExplorer{R}, x::A, mask::Trues) where {I<:Real, A<:AbstractArray{I}, R<:Random.AbstractRNG} = RLBase.plan!(s, x)
+
+function RLBase.plan!(s::EpsilonSpeedyExplorer{R}, values::A, mask::M) where {I<:Real, A<:AbstractArray{I}, M<:Union{BitVector, Vector{Bool}}, R<:Random.AbstractRNG}
+    ϵ = get_ϵ(s)
+    s.step[] += 1
+    # NOTE: takes first max element, doesn't break ties randomly
+    rand(s.rng) >= ϵ ? RLCore.findmax_masked(values, mask)[2] : rand(s.rng, findall(mask))
+end
+
+"""
+    prob(s::EpsilonSpeedyExplorer, values) -> Categorical
+    prob(s::EpsilonSpeedyExplorer, values, mask) -> Categorical
+
+Return the probability of selecting each action given the estimated `values` of each action.
+"""
+function RLBase.prob(s::EpsilonSpeedyExplorer, values::A) where {I<:Real, A<:AbstractArray{I}}
+    ϵ, n = get_ϵ(s), length(values)
+    probs = fill(ϵ / n, n)
+    probs[findmax(values)[2]] += 1 - ϵ
+    Categorical(probs; check_args=false)
+end
+
+function RLBase.prob(s::EpsilonSpeedyExplorer, values::A, action::Integer) where {I<:Real, A<:AbstractArray{I}}
+    ϵ, n = get_ϵ(s), length(values)
+    if action == findmax(values)[2]
+        ϵ / n + 1 - ϵ
+    else
+        ϵ / n
+    end
+end
+
+function RLBase.prob(s::EpsilonSpeedyExplorer, values::A, mask::M) where {I<:Real, A<:AbstractArray{I}, M<:Union{BitVector, Vector{Bool}}}
+    ϵ, n = get_ϵ(s), length(values)
+    probs = zeros(n)
+    probs[mask] .= ϵ / sum(mask)
+    probs[RLCore.findmax_masked(values, mask)[2]] += 1 - ϵ
+    Categorical(probs; check_args=false)
+end
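
A brief usage sketch of the new explorer (β = 0.01 is an arbitrary choice here, and the value estimates are made up). Every `plan!` call advances the internal step counter, so ϵ = exp(-βt) decays toward zero:

    using ReinforcementLearningBase: RLBase
    using ReinforcementLearningFarm: EpsilonSpeedyExplorer, get_ϵ

    explorer = EpsilonSpeedyExplorer(0.01)
    get_ϵ(explorer)  # exp(-0.01 * 1) ≈ 0.99: almost always explores at first
    for _ in 1:1000
        RLBase.plan!(explorer, [0.1, 0.5, 0.5, 0.3])
    end
    get_ϵ(explorer)  # exp(-0.01 * 1001) ≈ 4.5e-5: effectively greedy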

@@ -0,0 +1 @@
+include("epsilon_speedy_explorer.jl")

@@ -1 +1,2 @@
 include("tabular/tabular.jl")
+include("explorers/explorers.jl")

@@ -0,0 +1,74 @@
+using ReinforcementLearningFarm: EpsilonSpeedyExplorer, get_ϵ
+using Random
+
+@testset "EpsilonSpeedyExplorer" begin
+    using Test
+
+    @testset "EpsilonSpeedyExplorer" begin
+        @testset "constructor" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            @test explorer.β == 0.1
+            @test explorer.β_neg == -0.1
+            @test explorer.step[] == 1
+            @test explorer.rng === Random.GLOBAL_RNG
+        end
+
+        @testset "get_ϵ" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            @test get_ϵ(explorer) ≈ exp(-0.1)
+            explorer.step[] = 10
+            @test get_ϵ(explorer) ≈ exp(-1.0)
+        end
+
+        @testset "plan" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            values = [1, 2, 3, 4, 5]
+            mask = [true, false, true, false, true]
+
+            @testset "without mask" begin
+                action = RLBase.plan!(explorer, values)
+                @test action ∈ 1:length(values)
+            end
+
+            @testset "with mask" begin
+                action = RLBase.plan!(explorer, values, mask)
+                @test action ∈ findall(mask)
+            end
+
+            @testset "with true mask" begin
+                true_mask = [true, true, true, true, true]
+                action = RLBase.plan!(explorer, values, true_mask)
+                @test action ∈ findall(true_mask)
+            end
+        end
+
+        @testset "prob" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            values = [1, 2, 3, 4, 5]
+            mask = [true, false, true, false, true]
+
+            @testset "without mask" begin
+                prob_dist = RLBase.prob(explorer, values)
+                @test prob_dist.p ≈ [0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.2761300655712324]
+            end
+
+            @testset "with mask" begin
+                prob_dist = RLBase.prob(explorer, values, mask)
+                @test prob_dist.p ≈ [0.30161247267865315, 0.0, 0.30161247267865315, 0.0, 0.39677505464269364]
+            end
+
+            @testset "with true mask" begin
+                true_mask = [true, true, true, true, true]
+                prob_dist = RLBase.prob(explorer, values, true_mask)
+                @test prob_dist.p ≈ [0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.2761300655712324]
+            end
+        end
+    end
+
+    @testset "EpsilonSpeedyExplorer correctness" begin
+        explorer = RLFarm.EpsilonSpeedyExplorer(1e-5)
+        explorer.step[] = Int(1e5)
+        @test RLFarm.get_ϵ(explorer) ≈ 0.36787944117144233
+    end
+end
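
The closing correctness test is just the closed form of the schedule evaluated at t = 1e5:

    exp(-1e-5 * 1e5)  # = exp(-1) ≈ 0.36787944117144233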

@@ -0,0 +1 @@
+include("epsilon_speedy_explorer.jl")
