diff --git a/Project.toml b/Project.toml
index 3df4743cb..a2dd593ea 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,6 +20,7 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Polyhedra = "67491407-f73d-577b-9b50-8179a7c68029"
 Quaternions = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 Scratch = "6c6a2e73-6563-6170-7368-637461726353"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
diff --git a/environments/environment.jl b/environments/environment.jl
index 46500d27f..94d1ddb8e 100644
--- a/environments/environment.jl
+++ b/environments/environment.jl
@@ -31,7 +31,7 @@ mutable struct Environment{X,T,M,A,O,I}
     dynamics_jacobian_state::Matrix{T}
     dynamics_jacobian_input::Matrix{T}
     input_previous::Vector{T}
-    control_map::Matrix{T} 
+    control_map::Matrix{T}
     num_states::Int
     num_inputs::Int
     num_observations::Int
@@ -66,33 +66,33 @@ end
     attitude_decompress: flag for pre- and post-concatenating Jacobians with attitude Jacobians
 """
 function Base.step(env::Environment, x, u;
-    gradients=false,
-    attitude_decompress=false)
+    gradients = false,
+    attitude_decompress = false)
 
     mechanism = env.mechanism
-    timestep= mechanism.timestep
+    timestep = mechanism.timestep
 
     x0 = x
     # u = clip(env.input_space, u) # control limits
     env.input_previous .= u # for rendering in Gym
-    u_scaled = env.control_map * u 
+    u_scaled = env.control_map * u
 
     z0 = env.representation == :minimal ? minimal_to_maximal(mechanism, x0) : x0
-    z1 = step!(mechanism, z0, u_scaled; opts=env.opts_step)
+    z1 = step!(mechanism, z0, u_scaled; opts = env.opts_step)
     env.state .= env.representation == :minimal ? maximal_to_minimal(mechanism, z1) : z1
 
     # Compute cost
     costs = cost(env, x, u)
 
-    # Check termination 
-    done = is_done(env, x)
+    # Check termination
+    done = is_done(env, x)
 
     # Gradients
     if gradients
         if env.representation == :minimal
-            fx, fu = get_minimal_gradients!(env.mechanism, z0, u_scaled, opts=env.opts_grad)
+            fx, fu = get_minimal_gradients!(env.mechanism, z0, u_scaled, opts = env.opts_grad)
         elseif env.representation == :maximal
-            fx, fu = get_maximal_gradients!(env.mechanism, z0, u_scaled, opts=env.opts_grad)
+            fx, fu = get_maximal_gradients!(env.mechanism, z0, u_scaled, opts = env.opts_grad)
             if attitude_decompress
                 A0 = attitude_jacobian(z0, length(env.mechanism.bodies))
                 A1 = attitude_jacobian(z1, length(env.mechanism.bodies))
@@ -109,11 +109,11 @@ function Base.step(env::Environment, x, u;
 end
 
 function Base.step(env::Environment, u;
-    gradients=false,
-    attitude_decompress=false)
-    step(env, env.state, u;
-        gradients=gradients,
-        attitude_decompress=attitude_decompress)
+    gradients = false,
+    attitude_decompress = false)
+    step(env, env.state, u;
+        gradients = gradients,
+        attitude_decompress = attitude_decompress)
 end
 
 """
@@ -156,7 +156,7 @@ is_done(env::Environment, x) = false
     x: state
 """
 function Base.reset(env::Environment{X};
-    x=nothing) where X
+    x = nothing) where {X}
     initialize!(env.mechanism, type2symbol(X))
 
     if x != nothing
@@ -172,15 +172,15 @@ function Base.reset(env::Environment{X};
     return get_observation(env)
 end
 
-function MeshCat.render(env::Environment, 
-    mode="human")
+function MeshCat.render(env::Environment,
+    mode = "human")
     z = env.representation == :minimal ?
        minimal_to_maximal(env.mechanism, env.state) : env.state
-    set_robot(env.vis, env.mechanism, z, name=:robot)
+    set_robot(env.vis, env.mechanism, z, name = :robot)
     return nothing
 end
 
-function seed(env::Environment; s=0)
-    env.rng[1] = MersenneTwister(seed)
+function seed(env::Environment, s = 0)
+    env.rng[1] = MersenneTwister(s)
     return nothing
 end
 
@@ -196,7 +196,7 @@ end
 abstract type Space{T,N} end
 
 """
-    BoxSpace{T,N} <: Environment{T,N}
+    BoxSpace{T,N} <: Space{T,N}
 
     domain with lower and upper limits
 
@@ -214,12 +214,12 @@ mutable struct BoxSpace{T,N} <: Space{T,N}
    dtype::DataType # this is always T, it's needed to interface with Stable-Baselines
 end
 
-function BoxSpace(n::Int; low::AbstractVector{T} = -ones(n), high::AbstractVector{T} = ones(n)) where T
+function BoxSpace(n::Int; low::AbstractVector{T} = -ones(n), high::AbstractVector{T} = ones(n)) where {T}
     return BoxSpace{T,n}(n, low, high, (n,), T)
 end
 
 function sample(s::BoxSpace{T,N}) where {T,N}
-    return rand(T,N) .* (s.high .- s.low) .+ s.low
+    return rand(T, N) .* (s.high .- s.low) .+ s.low
 end
 
 function contains(s::BoxSpace{T,N}, v::AbstractVector{T}) where {T,N}
@@ -230,5 +230,4 @@ function clip(s::BoxSpace, u)
     clamp.(u, s.low, s.high)
 end
 
-
-
+Random.rand(rng::Random.AbstractRNG, s::BoxSpace{T,N}) where {T,N} = return rand(rng, T, N) .* (s.high .- s.low) .+ s.low
diff --git a/environments/rlenv.jl b/environments/rlenv.jl
new file mode 100644
index 000000000..3baf47d12
--- /dev/null
+++ b/environments/rlenv.jl
@@ -0,0 +1,46 @@
+using ReinforcementLearningBase: RLBase
+
+mutable struct DojoRLEnv{T} <: RLBase.AbstractEnv
+    dojoenv::Environment
+    state::Vector{T}
+    reward::T
+    done::Bool
+    info::Dict
+end
+
+function DojoRLEnv(dojoenv::Environment{X,T}) where {X,T}
+    state = reset(dojoenv)
+    return DojoRLEnv{T}(dojoenv, state, convert(T, 0.0), false, Dict())
+end
+
+function DojoRLEnv(name::String; kwargs...)
+    DojoRLEnv(Dojo.get_environment(name; kwargs...))
+end
+
+function Base.convert(::Type{RLBase.Space}, s::BoxSpace)
+    RLBase.Space([BoxSpace(1; low = s.low[i:i], high = s.high[i:i]) for i in 1:s.n])
+end
+
+RLBase.action_space(env::DojoRLEnv) = convert(RLBase.Space, env.dojoenv.input_space)
+RLBase.state_space(env::DojoRLEnv) = convert(RLBase.Space, env.dojoenv.observation_space)
+RLBase.is_terminated(env::DojoRLEnv) = env.done
+
+RLBase.reset!(env::DojoRLEnv) = reset(env.dojoenv)
+
+RLBase.reward(env::DojoRLEnv) = env.reward
+RLBase.state(env::DojoRLEnv) = env.state
+
+Random.seed!(env::DojoRLEnv, seed) = Dojo.seed(env.dojoenv, seed)
+
+# TODO:
+# RLBase.ChanceStyle(env::DojoRLEnv) = RLBase.DETERMINISTIC
+
+function (env::DojoRLEnv)(a)
+    s, r, d, i = step(env.dojoenv, a)
+    env.state .= s
+    env.reward = r
+    env.done = d
+    env.info = i
+    return nothing
+end
+(env::DojoRLEnv)(a::Number) = env([a])
diff --git a/examples/deeprl/Project.toml b/examples/deeprl/Project.toml
new file mode 100644
index 000000000..81e370e7a
--- /dev/null
+++ b/examples/deeprl/Project.toml
@@ -0,0 +1,6 @@
+[deps]
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+Dojo = "ac60b53e-8d92-4c83-b960-e78698fa1916"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"
diff --git a/examples/deeprl/ant_ddpg.jl b/examples/deeprl/ant_ddpg.jl
new file mode 100644
index 000000000..e71d2844c
--- /dev/null
+++ b/examples/deeprl/ant_ddpg.jl
@@ -0,0 +1,80 @@
+using ReinforcementLearning
+using Flux
+using Flux.Losses
+
+using Random
+using Dojo
+
+function RL.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:DDPG},
+    ::Val{:DojoAnt},
+    ::Nothing,
+    save_dir = nothing,
+    seed = 42
+)
+
+    rng = MersenneTwister(seed)
+    env = Dojo.DojoRLEnv("ant")
+    Random.seed!(env, seed)
+    A = action_space(env)
+    ns, na = length(state(env)), length(action_space(env))
+    @show na
+
+    init = glorot_uniform(rng)
+
+    create_actor() = Chain(
+        Dense(ns, 30, relu; init = init),
+        Dense(30, 30, relu; init = init),
+        Dense(30, na, tanh; init = init),
+    )
+    create_critic() = Chain(
+        Dense(ns + na, 30, relu; init = init),
+        Dense(30, 30, relu; init = init),
+        Dense(30, 1; init = init),
+    )
+
+    agent = Agent(
+        policy = DDPGPolicy(
+            behavior_actor = NeuralNetworkApproximator(
+                model = create_actor(),
+                optimizer = ADAM(),
+            ),
+            behavior_critic = NeuralNetworkApproximator(
+                model = create_critic(),
+                optimizer = ADAM(),
+            ),
+            target_actor = NeuralNetworkApproximator(
+                model = create_actor(),
+                optimizer = ADAM(),
+            ),
+            target_critic = NeuralNetworkApproximator(
+                model = create_critic(),
+                optimizer = ADAM(),
+            ),
+            γ = 0.99f0,
+            ρ = 0.995f0,
+            na = na,
+            batch_size = 64,
+            start_steps = 1000,
+            start_policy = RandomPolicy(A; rng = rng),
+            update_after = 1000,
+            update_freq = 1,
+            act_limit = 1.0,
+            act_noise = 0.1,
+            rng = rng,
+        ),
+        trajectory = CircularArraySARTTrajectory(
+            capacity = 10000,
+            state = Vector{Float32} => (ns,),
+            action = Float32 => (na, ),
+        ),
+    )
+
+    stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
+    hook = TotalRewardPerEpisode()
+    Experiment(agent, env, stop_condition, hook, "# Dojo Ant with DDPG")
+end
+
+ex = E`JuliaRL_DDPG_DojoAnt`
+run(ex)
\ No newline at end of file
diff --git a/examples/deeprl/ant_ppo.jl b/examples/deeprl/ant_ppo.jl
new file mode 100644
index 000000000..ddb06ebf2
--- /dev/null
+++ b/examples/deeprl/ant_ppo.jl
@@ -0,0 +1,73 @@
+using ReinforcementLearning
+using Flux
+using Flux.Losses
+
+using Random
+using Distributions
+using Dojo
+
+function RL.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:PPO},
+    ::Val{:DojoAnt},
+    ::Nothing,
+    save_dir = nothing,
+    seed = 42
+)
+    rng = MersenneTwister(seed)
+    N_ENV = 6
+    UPDATE_FREQ = 32
+    env_vec = [Dojo.DojoRLEnv("ant") for i in 1:N_ENV]
+    for i in 1:N_ENV
+        Random.seed!(env_vec[i], hash(seed+i))
+    end
+    env = MultiThreadEnv(env_vec)
+
+    ns, na = length(state(env[1])), length(action_space(env[1]))
+    RLBase.reset!(env; is_force=true)
+
+    agent = Agent(
+        policy = PPOPolicy(
+            approximator = ActorCritic(
+                actor = GaussianNetwork(
+                    pre = Chain(
+                        Dense(ns, 64, relu; init = glorot_uniform(rng)),
+                        Dense(64, 64, relu; init = glorot_uniform(rng)),
+                    ),
+                    μ = Chain(Dense(64, na, tanh; init = glorot_uniform(rng)), vec),
+                    logσ = Chain(Dense(64, na; init = glorot_uniform(rng)), vec),
+                ),
+                critic = Chain(
+                    Dense(ns, 256, relu; init = glorot_uniform(rng)),
+                    Dense(256, na; init = glorot_uniform(rng)),
+                ),
+                optimizer = ADAM(1e-3),
+            ),
+            γ = 0.99f0,
+            λ = 0.95f0,
+            clip_range = 0.1f0,
+            max_grad_norm = 0.5f0,
+            n_epochs = 4,
+            n_microbatches = 4,
+            actor_loss_weight = 1.0f0,
+            critic_loss_weight = 0.5f0,
+            entropy_loss_weight = 0.001f0,
+            dist = Normal,
+            update_freq = UPDATE_FREQ,
+        ),
+        trajectory = PPOTrajectory(;
+            capacity = UPDATE_FREQ,
+            state = Matrix{Float32} => (ns, N_ENV),
+            action = Matrix{Float32} => (na, N_ENV),
+            action_log_prob = Vector{Float32} => (N_ENV,),
+            reward = Vector{Float32} => (N_ENV,),
+            terminal = Vector{Bool} => (N_ENV,),
+        ),
+    )
+    stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
+    hook = TotalBatchRewardPerEpisode(N_ENV)
+    Experiment(agent, env, stop_condition, hook, "# PPO with Dojo Ant")
+end
+
+ex = E`JuliaRL_PPO_DojoAnt`
+run(ex)
\ No newline at end of file
diff --git a/examples/deeprl/cartpole_ddpg.jl b/examples/deeprl/cartpole_ddpg.jl
new file mode 100644
index 000000000..4cf3f1a1a
--- /dev/null
+++ b/examples/deeprl/cartpole_ddpg.jl
@@ -0,0 +1,88 @@
+using ReinforcementLearning
+using Flux
+using Flux.Losses
+
+using Random
+using Dojo
+
+function RL.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:DDPG},
+    ::Val{:DojoCartpole},
+    ::Nothing,
+    save_dir = nothing,
+    seed = 42
+)
+
+    rng = MersenneTwister(seed)
+    inner_env = Dojo.DojoRLEnv("cartpole")
+    Random.seed!(inner_env, seed)
+    # TODO
+    low = -5.0
+    high = 5.0
+    ns, na = length(state(inner_env)), length(action_space(inner_env))
+    @show na
+    A = Dojo.BoxSpace(na)
+    env = ActionTransformedEnv(
+        inner_env;
+        action_mapping = x -> low .+ (x .+ 1) .* 0.5 .* (high .- low),
+        action_space_mapping = _ -> A
+    )
+
+    init = glorot_uniform(rng)
+
+    create_actor() = Chain(
+        Dense(ns, 30, relu; init = init),
+        Dense(30, 30, relu; init = init),
+        Dense(30, na, tanh; init = init),
+    )
+    create_critic() = Chain(
+        Dense(ns + na, 30, relu; init = init),
+        Dense(30, 30, relu; init = init),
+        Dense(30, 1; init = init),
+    )
+
+    agent = Agent(
+        policy = DDPGPolicy(
+            behavior_actor = NeuralNetworkApproximator(
+                model = create_actor(),
+                optimizer = ADAM(),
+            ),
+            behavior_critic = NeuralNetworkApproximator(
+                model = create_critic(),
+                optimizer = ADAM(),
+            ),
+            target_actor = NeuralNetworkApproximator(
+                model = create_actor(),
+                optimizer = ADAM(),
+            ),
+            target_critic = NeuralNetworkApproximator(
+                model = create_critic(),
+                optimizer = ADAM(),
+            ),
+            γ = 0.99f0,
+            ρ = 0.995f0,
+            na = na,
+            batch_size = 64,
+            start_steps = 1000,
+            start_policy = RandomPolicy(A; rng = rng),
+            update_after = 1000,
+            update_freq = 1,
+            act_limit = 1.0,
+            act_noise = 0.1,
+            rng = rng,
+        ),
+        trajectory = CircularArraySARTTrajectory(
+            capacity = 10000,
+            state = Vector{Float32} => (ns,),
+            action = Float32 => (na, ),
+        ),
+    )
+
+    stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
+    hook = TotalRewardPerEpisode()
+    Experiment(agent, env, stop_condition, hook, "# Dojo Cartpole with DDPG")
+end
+
+ex = E`JuliaRL_DDPG_DojoCartpole`
+run(ex)
\ No newline at end of file
diff --git a/examples/deeprl/cartpole_ppo.jl b/examples/deeprl/cartpole_ppo.jl
new file mode 100644
index 000000000..659d1e7b8
--- /dev/null
+++ b/examples/deeprl/cartpole_ppo.jl
@@ -0,0 +1,74 @@
+using ReinforcementLearning
+using Flux
+using Flux.Losses
+
+using Random
+using Distributions
+using Dojo
+
+function RL.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:PPO},
+    ::Val{:DojoCartpole},
+    ::Nothing,
+    save_dir = nothing,
+    seed = 42
+)
+    rng = MersenneTwister(seed)
+    N_ENV = 6
+    UPDATE_FREQ = 32
+    env_vec = [Dojo.DojoRLEnv("cartpole") for i in 1:N_ENV]
+    for i in 1:N_ENV
+        Random.seed!(env_vec[i], hash(seed+i))
+    end
+    env = MultiThreadEnv(env_vec)
+
+    ns, na = length(state(env[1])), length(action_space(env[1]))
+    RLBase.reset!(env; is_force=true)
+
+    agent = Agent(
+        policy = PPOPolicy(
+            approximator = ActorCritic(
+                actor = GaussianNetwork(
+                    pre = Chain(
+                        Dense(ns, 64, relu; init = glorot_uniform(rng)),
+                        Dense(64, 64, relu; init = glorot_uniform(rng)),
+                    ),
+                    μ = Chain(Dense(64, na, tanh; init = glorot_uniform(rng)), vec),
+                    logσ = Chain(Dense(64, na; init = glorot_uniform(rng)), vec),
+                ),
+                critic = Chain(
+                    Dense(ns, 64, relu; init = glorot_uniform(rng)),
+                    Dense(64, 64, relu; init = glorot_uniform(rng)),
+                    Dense(64, 1; init = glorot_uniform(rng)),
+                ),
+                optimizer = ADAM(1e-3),
+            ),
+            γ = 0.99f0,
+            λ = 0.95f0,
+            clip_range = 0.2f0,
+            max_grad_norm = 0.5f0,
+            n_epochs = 10,
+            n_microbatches = 32,
+            actor_loss_weight = 1.0f0,
+            critic_loss_weight = 0.5f0,
+            entropy_loss_weight = 0.001f0,
+            dist = Normal,
+            update_freq = UPDATE_FREQ,
+        ),
+        trajectory = PPOTrajectory(;
+            capacity = UPDATE_FREQ,
+            state = Matrix{Float32} => (ns, N_ENV),
+            action = Matrix{Float32} => (na, N_ENV),
+            action_log_prob = Vector{Float32} => (N_ENV,),
+            reward = Vector{Float32} => (N_ENV,),
+            terminal = Vector{Bool} => (N_ENV,),
+        ),
+    )
+    stop_condition = StopAfterStep(10_000, is_show_progress=!haskey(ENV, "CI"))
+    hook = TotalBatchRewardPerEpisode(N_ENV)
+    Experiment(agent, env, stop_condition, hook, "# PPO with Dojo Cartpole")
+end
+
+ex = E`JuliaRL_PPO_DojoCartpole`
+run(ex)
\ No newline at end of file
diff --git a/src/Dojo.jl b/src/Dojo.jl
index 4da0c62c0..778c15452 100644
--- a/src/Dojo.jl
+++ b/src/Dojo.jl
@@ -145,6 +145,7 @@ include(joinpath("..", "environments", "environment.jl"))
 include(joinpath("..", "environments", "dynamics.jl"))
 include(joinpath("..", "environments", "utilities.jl"))
 include(joinpath("..", "environments", "include.jl"))
+include(joinpath("..", "environments", "rlenv.jl"))
 
 # Bodies
 export