Commit 93a13d3
Epsilon Speedy Explorer (#1052)
* Add epsilon speedy explorer
* Fix type stability
* Add docstring
* Finish docstring
* Add epsilon-greedy explorer tests
* add better prob tests

---------

Co-authored-by: Jeremiah Lewis <--get>
1 parent 06cabb9 commit 93a13d3

File tree

12 files changed: +252 -5 lines changed

src/ReinforcementLearningCore/Project.toml

+1 -3

@@ -37,7 +37,6 @@ Metal = "1.0"
 ProgressMeter = "1"
 Reexport = "1"
 ReinforcementLearningBase = "0.12"
-ReinforcementLearningFarm = "0.0.1"
 ReinforcementLearningTrajectories = "0.3.7"
 Statistics = "1"
 StatsBase = "0.32, 0.33, 0.34"
@@ -53,10 +52,9 @@ Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
-ReinforcementLearningFarm = "14eff660-7080-4cec-bba2-cfb12cd77ac3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [targets]
-test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "ReinforcementLearningFarm", "Test", "UUIDs"]
+test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "Test", "UUIDs"]

src/ReinforcementLearningCore/src/policies/explorers/epsilon_greedy_explorer.jl

+2 -2

@@ -133,8 +133,8 @@ end
 #####
 
 """
-    prob(s::EpsilonGreedyExplorer, values) ->Categorical
-    prob(s::EpsilonGreedyExplorer, values, mask) ->Categorical
+    prob(s::EpsilonGreedyExplorer, values) -> Categorical
+    prob(s::EpsilonGreedyExplorer, values, mask) -> Categorical
 
 Return the probability of selecting each action given the estimated `values` of each action.
 """

@@ -0,0 +1,73 @@
+using Test
+using Distributions: Categorical
+using ReinforcementLearningCore: EpsilonGreedyExplorer, GreedyExplorer, get_ϵ
+using Random
+
+@testset "EpsilonGreedyExplorer" begin
+    @testset "get_ϵ for linear kind" begin
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 50) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 100) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 150) ≈ 0.5
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 200) ≈ 0.1
+    end
+
+    @testset "get_ϵ for exp kind" begin
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 50) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 100) ≈ 0.9
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 150) ≈ 0.5852245277701068
+        @test get_ϵ(EpsilonGreedyExplorer(kind=:exp, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100), 2000) ≈ 0.1 atol=1e-2
+    end
+
+    @testset "EpsilonGreedyExplorer Tests" begin
+        # Test plan! for is_break_tie=true
+        rng = Random.default_rng(123)
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=true, rng=rng)
+        values = [0.1, 0.5, 0.5, 0.3]
+        actions = []
+        for _ in 1:300
+            push!(actions, RLBase.plan!(s, values))
+        end
+        @test length(unique(actions)) == 4
+    end
+
+    @testset "EpsilonGreedyExplorer Tests" begin
+        # Test plan! for is_break_tie=false
+        rng = Random.default_rng(123)
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=false, rng=rng)
+        values = [0.1, 0.5, 0.5, 0.3]
+        actions = []
+        for _ in 1:300
+            push!(actions, RLBase.plan!(s, values))
+        end
+        @test length(unique(actions)) == 4
+    end
+
+    @testset "prob for is_break_tie=true" begin
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=true)
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.prob(s, values) ≈ Categorical([0.225, 0.275, 0.275, 0.225])
+        @test RLBase.prob(s, values, 2) ≈ 0.275
+    end
+
+    @testset "prob for is_break_tie=false" begin
+        s = EpsilonGreedyExplorer(kind=:linear, ϵ_init=0.9, ϵ_stable=0.1, warmup_steps=100, decay_steps=100, is_break_tie=false)
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.prob(s, values) ≈ Categorical([0.225, 0.32499999999999996, 0.225, 0.225])
+        @test RLBase.prob(s, values, 2) ≈ 0.32500000000000007
+    end
+end
+
+@testset "GreedyExplorer" begin
+    @testset "plan!" begin
+        s = GreedyExplorer()
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.plan!(s, values) == 2
+    end
+
+    @testset "prob" begin
+        s = GreedyExplorer()
+        values = [0.1, 0.5, 0.5, 0.3]
+        @test RLBase.prob(s, values) ≈ Categorical([0.0, 1.0, 0.0, 0.0])
+        @test RLBase.prob(s, values, 2) == 1.0
+    end
+end
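
The expected constants above follow directly from the two decay schedules. A quick sanity check, assuming the usual definitions (linear interpolation after the warmup phase; exponential decay toward ϵ_stable):

    # :linear at step 150: 50 of the 100 decay steps have elapsed
    0.9 - (0.9 - 0.1) * 50 / 100        # = 0.5
    # :exp at step 150:
    0.1 + (0.9 - 0.1) * exp(-50 / 100)  # ≈ 0.5852245277701068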

@@ -0,0 +1 @@
+include("epsilon_greedy_explorer.jl")

@@ -1,4 +1,5 @@
 include("agent.jl")
 include("multi_agent.jl")
 include("learners/learners.jl")
+include("explorers/explorers.jl")
 include("q_based_policy.jl")

src/ReinforcementLearningFarm/Project.toml

+4

@@ -5,13 +5,17 @@ version = "0.0.1"
 [deps]
 CircularArrayBuffers = "9de3a189-e0c0-4e15-ba3b-b14b9fb0aec1"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
 
 [compat]
+FillArrays = "1"
 CircularArrayBuffers = "0.1.12"
+Distributions = "0.25"
 ReinforcementLearningBase = "0.12"
 ReinforcementLearningCore = "0.14"
 ReinforcementLearningEnvironments = "0.8"

@@ -1 +1,2 @@
 include("tabular/tabular.jl")
+include("explorers/explorers.jl")

@@ -0,0 +1,92 @@
+
+using ReinforcementLearningCore
+using ReinforcementLearningBase
+import ReinforcementLearningBase: RLBase
+
+using FillArrays: Trues
+using Random
+using Distributions: Categorical
+using Base
+
+"""
+    EpsilonSpeedyExplorer(β::Float64)
+
+`EpsilonSpeedyExplorer` is an explorer that selects the action with the maximum value with probability `1 - ϵ` and selects a random action with probability `ϵ`.
+The probability of selecting a random action is given by `exp(-β * t)`, where `t` is the number of times `plan!` has been called.
+`EpsilonSpeedyExplorer` differs from `EpsilonGreedyExplorer` in that it uses the `exp` function to calculate the probability of selecting a random action over the full range of `t` and only accepts one argument, `β`.
+"""
+struct EpsilonSpeedyExplorer{R} <: AbstractExplorer
+    β::Float64
+    β_neg::Float64
+    step::Base.RefValue{Int}
+    rng::R
+end
+
+function EpsilonSpeedyExplorer(β::Float64)
+    EpsilonSpeedyExplorer{typeof(Random.GLOBAL_RNG)}(
+        β,
+        β * -1,
+        Ref(1),
+        Random.GLOBAL_RNG,
+    )
+end
+
+function get_ϵ(s::EpsilonSpeedyExplorer)
+    exp(s.β_neg * s.step[])
+end
+
+"""
+    RLBase.plan!(s::EpsilonSpeedyExplorer, values)
+
+!!! note
+    If multiple values share the maximum value, the first one is selected;
+    ties are not broken randomly.
+
+    `NaN` will be filtered unless all the values are `NaN`.
+    In that case, a random one will be returned.
+"""
+function RLBase.plan!(s::EpsilonSpeedyExplorer{R}, values::A) where {I<:Real, A<:AbstractArray{I}, R<:Random.AbstractRNG}
+    ϵ = get_ϵ(s)
+    s.step[] += 1
+    rand(s.rng) >= ϵ ? findmax(values)[2] : rand(s.rng, 1:length(values))
+end
+
+RLBase.plan!(s::EpsilonSpeedyExplorer{R}, x::A, mask::Trues) where {I<:Real, A<:AbstractArray{I}, R<:Random.AbstractRNG} = RLBase.plan!(s, x)
+
+function RLBase.plan!(s::EpsilonSpeedyExplorer{R}, values::A, mask::M) where {I<:Real, A<:AbstractArray{I}, M<:Union{BitVector, Vector{Bool}}, R<:Random.AbstractRNG}
+    ϵ = get_ϵ(s)
+    s.step[] += 1
+    # NOTE: takes first max element, doesn't break ties randomly
+    rand(s.rng) >= ϵ ? RLCore.findmax_masked(values, mask)[2] : rand(s.rng, findall(mask))
+end
+
+"""
+    prob(s::EpsilonSpeedyExplorer, values) -> Categorical
+    prob(s::EpsilonSpeedyExplorer, values, mask) -> Categorical
+
+Return the probability of selecting each action given the estimated `values` of each action.
+"""
+function RLBase.prob(s::EpsilonSpeedyExplorer, values::A) where {I<:Real, A<:AbstractArray{I}}
+    ϵ, n = get_ϵ(s), length(values)
+    probs = fill(ϵ / n, n)
+    probs[findmax(values)[2]] += 1 - ϵ
+    Categorical(probs; check_args=false)
+end
+
+function RLBase.prob(s::EpsilonSpeedyExplorer, values::A, action::Integer) where {I<:Real, A<:AbstractArray{I}}
+    ϵ, n = get_ϵ(s), length(values)
+    if action == findmax(values)[2]
+        ϵ / n + 1 - ϵ
+    else
+        ϵ / n
+    end
+end
+
+function RLBase.prob(s::EpsilonSpeedyExplorer, values::A, mask::M) where {I<:Real, A<:AbstractArray{I}, M<:Union{BitVector, Vector{Bool}}}
+    ϵ, n = get_ϵ(s), length(values)
+    probs = zeros(n)
+    probs[mask] .= ϵ / sum(mask)
+    probs[RLCore.findmax_masked(values, mask)[2]] += 1 - ϵ
+    Categorical(probs; check_args=false)
+end
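
A brief usage sketch of the new explorer (β = 0.01 is an arbitrary choice here, and the value estimates are made up). Every `plan!` call advances the internal step counter, so ϵ = exp(-βt) decays toward zero:

    using ReinforcementLearningBase: RLBase
    using ReinforcementLearningFarm: EpsilonSpeedyExplorer, get_ϵ

    explorer = EpsilonSpeedyExplorer(0.01)
    get_ϵ(explorer)  # exp(-0.01 * 1) ≈ 0.99: almost always explores at first
    for _ in 1:1000
        RLBase.plan!(explorer, [0.1, 0.5, 0.5, 0.3])
    end
    get_ϵ(explorer)  # exp(-0.01 * 1001) ≈ 4.5e-5: effectively greedy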

@@ -0,0 +1 @@
+include("epsilon_speedy_explorer.jl")

@@ -1 +1,2 @@
 include("tabular/tabular.jl")
+include("explorers/explorers.jl")

@@ -0,0 +1,74 @@
+using ReinforcementLearningFarm: EpsilonSpeedyExplorer, get_ϵ
+using Random
+
+@testset "EpsilonSpeedyExplorer" begin
+    using Test
+
+    @testset "EpsilonSpeedyExplorer" begin
+        @testset "constructor" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            @test explorer.β == 0.1
+            @test explorer.β_neg == -0.1
+            @test explorer.step[] == 1
+            @test explorer.rng === Random.GLOBAL_RNG
+        end
+
+        @testset "get_ϵ" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            @test get_ϵ(explorer) ≈ exp(-0.1)
+            explorer.step[] = 10
+            @test get_ϵ(explorer) ≈ exp(-1.0)
+        end
+
+        @testset "plan" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            values = [1, 2, 3, 4, 5]
+            mask = [true, false, true, false, true]
+
+            @testset "without mask" begin
+                action = RLBase.plan!(explorer, values)
+                @test action ∈ 1:length(values)
+            end
+
+            @testset "with mask" begin
+                action = RLBase.plan!(explorer, values, mask)
+                @test action ∈ findall(mask)
+            end
+
+            @testset "with true mask" begin
+                true_mask = [true, true, true, true, true]
+                action = RLBase.plan!(explorer, values, true_mask)
+                @test action ∈ findall(true_mask)
+            end
+        end
+
+        @testset "prob" begin
+            explorer = EpsilonSpeedyExplorer(0.1)
+            values = [1, 2, 3, 4, 5]
+            mask = [true, false, true, false, true]
+
+            @testset "without mask" begin
+                prob_dist = RLBase.prob(explorer, values)
+                @test prob_dist.p ≈ [0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.2761300655712324]
+            end
+
+            @testset "with mask" begin
+                prob_dist = RLBase.prob(explorer, values, mask)
+                @test prob_dist.p ≈ [0.30161247267865315, 0.0, 0.30161247267865315, 0.0, 0.39677505464269364]
+            end
+
+            @testset "with true mask" begin
+                true_mask = [true, true, true, true, true]
+                prob_dist = RLBase.prob(explorer, values, true_mask)
+                @test prob_dist.p ≈ [0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.1809674836071919, 0.2761300655712324]
+            end
+        end
+    end
+
+    @testset "EpsilonSpeedyExplorer correctness" begin
+        explorer = RLFarm.EpsilonSpeedyExplorer(1e-5)
+        explorer.step[] = Int(1e5)
+        @test RLFarm.get_ϵ(explorer) ≈ 0.36787944117144233
+    end
+end
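
The closing correctness test is just the closed form of the schedule evaluated at t = 1e5:

    exp(-1e-5 * 1e5)  # = exp(-1) ≈ 0.36787944117144233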

@@ -0,0 +1 @@
+include("epsilon_speedy_explorer.jl")
