Added features for experiment reproducibility and many other improvements #203
Changes from 35 commits
@@ -2,11 +2,15 @@
struct SmartReward <: AbstractReward end

This is the smart reward, which will be used to teach the agent to prioritize paths that lead to improving solutions.
This reward is the exact reward implemented by Quentin Cappart in
his recent paper: Combining RL & CP for Combinatorial Optimization, https://arxiv.org/pdf/2006.01610.pdf.
"""
mutable struct SmartReward <: AbstractReward
    value::Float32
end

ρ = 0.001

SmartReward(model::CPModel) = SmartReward(0)

"""
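For readers unfamiliar with the reward interface, the hunk above follows a recurring pattern: a mutable struct holding a running value, a constructor taking the CPModel, and set_reward! methods for the search phases. Below is a minimal sketch of a custom reward written in that style; MyReward is a hypothetical name, and the exact set_reward! signature (in particular how symbol is passed) is assumed from the surrounding hunks rather than taken from the package documentation.

```julia
# Hypothetical reward type, mirroring the SmartReward pattern shown above.
# Assumes the surrounding module (e.g. SeaPearl) provides AbstractReward, CPModel,
# LearnedHeuristic, StepPhase, AbstractStateRepresentation and ActionOutput.
mutable struct MyReward <: AbstractReward
    value::Float32
end

# Constructor from the CP model, as done for SmartReward above.
MyReward(model::CPModel) = MyReward(0)

# Signature assumed from the StepPhase method in this diff.
function set_reward!(::Type{StepPhase}, lh::LearnedHeuristic{SR, MyReward, A}, model::CPModel, symbol::Union{Nothing, Symbol}) where {
    SR <: AbstractStateRepresentation,
    A <: ActionOutput
}
    # Penalize infeasible branches, reward found solutions.
    if symbol == :Infeasible
        lh.reward.value -= 1
    elseif symbol == :FoundSolution
        lh.reward.value += 1
    end
end
```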
@@ -19,15 +23,18 @@ function set_reward!(::Type{StepPhase}, lh::LearnedHeuristic{SR, SmartReward, A}
    A <: ActionOutput
}
    if symbol == :Infeasible
        #println("INFEASIBLE")
        #lh.reward.value -= last_episode_total_reward(lh.agent.trajectory)
        lh.reward.value -= 0

    elseif symbol == :FoundSolution
        #println("SOLUTION FOUND, score : ", assignedValue(model.objective), " delta : ", 15-assignedValue(model.objective), " accumulated reward : ", model.statistics.AccumulatedRewardBeforeReset)
        lh.reward.value += isnothing(model.objective) ? 0 : 100 * (-assignedValue(model.objective))
        #lh.reward.value += model.statistics.lastPruning

    elseif symbol == :FoundSolution # last portion required to get the full closed path
        dist = model.adhocInfo[1]
        n = size(dist)[1]
        max_dist = Float32(Base.maximum(dist))
        if isbound(model.variables["a_"*string(n-1)])
            last = assignedValue(model.variables["a_"*string(n-1)])
            first = assignedValue(model.variables["a_1"])

            dist_to_first_node = lh.current_state.dist[last, first] * max_dist
Review comment: I guess it's the right behavior, but I don't understand the …

Reply: This is supposed to be copied from the original implementation of the reward provided by @qcappart. However, as I can't find it anymore in the original repo, I removed the factor.
            lh.reward.value += -ρ*dist_to_first_node
        end
    elseif symbol == :Feasible
        lh.reward.value -= 0
    elseif symbol == :BackTracking
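As a concrete illustration of the closing-edge penalty in the FoundSolution branch above, here is the same computation with made-up numbers; lh.current_state.dist is assumed to hold distances normalized by max_dist, which is why the code multiplies by max_dist to recover the real distance.

```julia
# Illustration only: closing-edge penalty with made-up numbers.
ρ = 0.001                     # same constant as defined earlier in the file
max_dist = 10.0f0             # Float32(Base.maximum(dist)): largest entry of the distance matrix
normalized_dist = 0.3f0       # assumed value of lh.current_state.dist[last, first]

dist_to_first_node = normalized_dist * max_dist   # 3.0: distance from the last node back to the start
penalty = -ρ * dist_to_first_node                 # -0.003 is added to lh.reward.value
```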
@@ -38,16 +45,34 @@ end
"""
    set_reward!(::DecisionPhase, lh::LearnedHeuristic{SmartReward, O}, model::CPModel)

Change the current reward at the DecisionPhase. This is called right before making the next decision, so you know you have the very last state before the new decision and every computation like fixPoints and backtracking has been done.

This computes the reward ρ*(1 + tour_upper_bound - last_dist), where ρ is a constant, tour_upper_bound is an upper bound on the length of the tour, and last_dist is the distance between the previous node and the target node decided by the previous decision (the reward is attributed just before taking a new decision).
Review comment: small typo in "takng".
"""
function set_reward!(::Type{DecisionPhase}, lh::LearnedHeuristic{SR, SmartReward, A}, model::CPModel) where {
    SR <: AbstractStateRepresentation,
    A <: ActionOutput
}
    #println("Decision, reward : ", model.statistics.lastPruning)
    dist = model.adhocInfo[1]
    n = size(dist)[1]

    tour_upper_bound = Base.maximum(dist) * n
    max_dist = Float32(Base.maximum(dist))

    if !isnothing(model.statistics.lastVar)
        x = model.statistics.lastVar
        s = x.id
        current = parse(Int, split(x.id, '_')[2])
        if isbound(model.variables["a_"*string(current)])
            a_i = assignedValue(model.variables["a_"*string(current)])
            v_i = assignedValue(model.variables["v_"*string(current)])
            last_dist = lh.current_state.dist[v_i, a_i] * max_dist
Review comment: This corresponds to the distance between the previous node and the node that has just been selected by the heuristic one step before. (Recall that the reward is always given one step after, just before making a new decision.)
            #print("last_dist : ", last_dist, " // ")
            lh.reward.value += ρ*(1 + tour_upper_bound - last_dist)
        end

    end

    #lh.reward.value += model.statistics.lastPruning

end
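As a sanity check on the DecisionPhase formula ρ*(1 + tour_upper_bound - last_dist), here is the same computation on made-up numbers (again assuming lh.current_state.dist stores normalized distances, so the multiplication by max_dist recovers the real edge length):

```julia
# Illustration only: DecisionPhase reward with made-up numbers.
ρ = 0.001
n = 5                                  # number of nodes in the instance
max_dist = 10.0f0                      # Float32(Base.maximum(dist))
tour_upper_bound = max_dist * n        # 50.0: n times the longest edge
last_dist = 0.3f0 * max_dist           # 3.0: length of the edge chosen by the previous decision

reward_increment = ρ * (1 + tour_upper_bound - last_dist)   # 0.001 * 48.0 = 0.048
```

Because tour_upper_bound dominates last_dist, every decision earns a positive increment, and shorter chosen edges earn slightly more, which is what pushes the agent toward shorter tours.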
@@ -9,6 +9,7 @@ manually change the mode again if he wants.
"""
function Flux.testmode!(lh::LearnedHeuristic, mode = true)
    Flux.testmode!(lh.agent, mode)
    lh.agent.policy.explorer.is_training = !mode
Review comment: fixing the RL agent's explorer value to zero during evaluation.

Reply: It seems to me that by default the explorer is not called when trainMode is false. However, this should not be a problem.
    lh.trainMode = !mode
end
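A typical usage of this switch, sketched from the signature above (the surrounding evaluation loop is illustrative only), would be:

```julia
# Sketch: temporarily put the learned heuristic lh in evaluation mode.
Flux.testmode!(lh, true)    # agent frozen and, with this change, the explorer stops exploring
# ... run evaluation episodes here ...
Flux.testmode!(lh, false)   # back to training mode: trainMode = true, explorer active again
```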
@@ -79,6 +80,8 @@ function get_observation!(lh::LearnedHeuristic, model::CPModel, x::AbstractIntVa

    # Initialize reward for the next state: not compulsory with DefaultReward, but maybe useful in case the user forgets it
    model.statistics.AccumulatedRewardBeforeReset += lh.reward.value
    model.statistics.AccumulatedRewardBeforeRestart += lh.reward.value

    lh.reward.value = 0

    # synchronize state:
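These two added lines implement an accumulate-then-reset pattern: the reward collected since the last observation is folded into the per-reset and per-restart counters just before the running value is cleared for the next state. A hedged sketch of how those counters could be inspected after a solve (only the two field names come from the diff; everything else is illustrative):

```julia
# Illustration only: reading the accumulated-reward statistics after a solve.
println("reward accumulated since last reset:   ", model.statistics.AccumulatedRewardBeforeReset)
println("reward accumulated since last restart: ", model.statistics.AccumulatedRewardBeforeRestart)
```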
@@ -1,13 +1,15 @@
"""
    last_episode_total_reward(t::AbstractTrajectory)

Compute the sum of every reward of the last episode of the trajectory.

For example, if t[:terminal] = [0, 0, 1, 0, 1, 1, 1, 0, 0, 1], the 7-th state is a terminal state, which means that the last episode started at step 8. Hence, last_episode_total_reward corresponds to the 3 last decisions.
"""
function last_episode_total_reward(t::AbstractTrajectory)
    last_index = length(t[:terminal])
    last_index == 0 && return 0
Review comment: return 0 in case the trajectory is empty. This was needed in order to evaluate the model before any training step without triggering a …
    #if t[:terminal][last_index] #TODO understand why they wrote this

    #if t[:terminal][last_index] Do we need to consider cases where the last state is not a terminal state ?
Review comment: Has this case been resolved?

Reply: Yes @marco-novaes98, this case is done in line 10.
    totalReward = t[:reward][last_index]

    i = 1

@@ -18,3 +20,4 @@ function last_episode_total_reward(t::AbstractTrajectory)
    end
    return totalReward
end
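To make the docstring example above concrete, here is a self-contained sketch of the same backward scan over plain vectors. It mirrors the behavior described in the docstring (sum the rewards from the step after the previous terminal up to the last step); the real function operates on an AbstractTrajectory and the middle of its loop is not shown in this diff, so this is a reconstruction rather than a copy.

```julia
# Illustration only: the "sum the rewards of the last episode" logic on plain vectors.
terminal = [0, 0, 1, 0, 1, 1, 1, 0, 0, 1]                       # 1 marks the end of an episode
reward   = [1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 4.0, 5.0, 6.0]   # made-up per-step rewards

function last_episode_total_reward_demo(terminal, reward)
    last_index = length(terminal)
    last_index == 0 && return 0.0          # empty trajectory, as in the added guard above
    total = reward[last_index]
    i = 1
    # walk backwards until we hit the terminal flag of the previous episode
    while last_index - i > 0 && terminal[last_index - i] != 1
        total += reward[last_index - i]
        i += 1
    end
    return total
end

last_episode_total_reward_demo(terminal, reward)   # 4.0 + 5.0 + 6.0 = 15.0: the 3 last decisions
```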
Review comment: It could be nice to explain in a few words how the smart reward works and why it's interesting to use it. If not, maybe add "section 2.2" after the paper's link to make this information easier for the user.