rm(list=ls())

M <- 15        # maximum stock level
gamma <- 0.99  # discount factor
q <- 0.1       # parameter of the demand distribution

# Truncated geometric demand distribution on 0..M
pD <- q*(1-q)^(0:(M-1))
pD[1+M] <- 1 - sum(pD)

# Draw one demand according to the distribution p (inverse-CDF sampling)
rdemand <- function(p){ sum(runif(1) > cumsum(p)) }

reward <- function(x, a, d){
  K <- 0.8  # fixed ordering (transport) cost
  c <- 0.5  # unit purchase cost
  h <- 0.3  # unit holding cost
  p <- 1    # unit selling price
  -K*(a>0) - c*max(0, min(x+a, M) - x) - h*x + p*min(d, x+a, M)
}

nextState <- function(x, a, d){ max(0, min(x+a, M) - d) }

# Simulate n steps under policy pi and return the sequence of rewards
simu <- function(n, pi){
  R <- array(0, n)
  X <- M
  for (t in 1:n){
    D <- rdemand(pD)
    R[t] <- reward(X, pi[X+1], D)
    X <- nextState(X, pi[X+1], D)
  }
  R
}

pi <- array(2, M+1)  # always order 2 machines (note: shadows the built-in constant pi)
n <- 200
R <- simu(n, pi)
V <- cumsum(R * gamma^(0:(n-1)))  # discounted cumulative reward
plot(0:n, c(0, V), type='l')

# Tabulate, for each action a, the transition matrix and the expected reward
trans <- list()
rew <- list()
for (a in 0:M){
  P <- matrix(0, ncol=1+M, nrow=1+M)
  r <- vector(mode='numeric', length=1+M)
  for (x in 0:M) for (d in 0:M){
    P[1+x, 1+nextState(x, a, d)] <- P[1+x, 1+nextState(x, a, d)] + pD[1+d]
    r[1+x] <- r[1+x] + pD[1+d]*reward(x, a, d)
  }
  trans[[1+a]] <- P
  rew[[1+a]] <- r
}

# Exact policy evaluation: solve (I - gamma*P_pol) V = r_pol
policyValue <- function(pol){
  P <- matrix(0, ncol=1+M, nrow=1+M)
  r <- as.vector(rep(0, 1+M))
  for (x in 0:M){
    a <- pol[1+x]
    P[1+x,] <- trans[[1+a]][1+x,]
    r[1+x] <- rew[[1+a]][1+x]
  }
  solve(diag(1+M) - gamma*P, r)
}
print(policyValue(pi))

# One application of the Bellman optimality operator; also returns the greedy policy
BellmanOperator <- function(V){
  res <- list()
  res$V <- V
  res$pol <- rep(0, 1+M)
  for (x in 0:M){
    Q <- rep(0, 1+M)
    for (a in 0:M){
      Q[1+a] <- rew[[1+a]][1+x] + gamma * trans[[1+a]][1+x,] %*% V
    }
    res$V[1+x] <- max(Q)
    res$pol[1+x] <- -1 + which.max(Q)
  }
  res
}

# Value iteration: iterate the Bellman operator until the sup-norm change is small
valueIteration <- function(){
  res <- list()
  res$V <- as.vector(rep(0, 1+M))
  res$pol <- rep(0, 1+M)
  oV <- res$V + 1
  while (max(abs(oV - res$V)) > 1e-4){
    oV <- res$V
    res <- BellmanOperator(res$V)
  }
  res
}

sol <- valueIteration()
print(sol$V)

# Compare the optimal value from the full-stock state (red line) with
# 50 simulated discounted returns of the optimal policy
n <- round(5/(1-gamma))  # horizon long enough for gamma^n to be negligible
plot(c(0, n), sol$V[1+M]*c(1,1), xlim=c(0,n), ylim=c(0, 1.5*sol$V[1+M]),
     type='l', col='red', lwd=3)
for (k in 1:50){
  R <- simu(n, sol$pol)
  V <- cumsum(R * gamma^(0:(n-1)))
  lines(0:n, c(0, V), type='l')
}

# Policy iteration: alternate exact evaluation and greedy improvement,
# starting from a random policy
policyIteration <- function(){
  res <- list()
  res$V <- as.vector(rep(0, 1+M))
  res$pol <- floor((1+M)*runif(1+M))
  opol <- res$pol + 1  # force entry into the loop
  while (any(opol != res$pol)){
    opol <- res$pol
    res <- BellmanOperator(policyValue(opol))
  }
  res
}

# Tabular Q-learning with an epsilon-greedy behaviour policy.
# The body of the loop after the exploration test is a standard completion
# sketch (assumed constant learning rate alpha, uniform exploration).
Qlearning <- function(n, alpha = 0.1){
  Q <- matrix(0, nrow=1+M, ncol=1+M)
  epsilon <- 0.1  # epsilon-greedy policy
  X <- M          # start with full stock
  for (t in 1:n){
    A <- -1 + which.max(Q[1+X,])  # greedy action (row 1+X corresponds to state X)
    if (runif(1) < epsilon) A <- floor((1+M)*runif(1))  # explore uniformly
    D <- rdemand(pD)
    R <- reward(X, A, D)
    Y <- nextState(X, A, D)
    Q[1+X, 1+A] <- Q[1+X, 1+A] + alpha*(R + gamma*max(Q[1+Y,]) - Q[1+X, 1+A])
    X <- Y
  }
  list(Q = Q, pol = apply(Q, 1, which.max) - 1)
}
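# Optional sanity check (illustrative sketch, not part of the algorithms
# above): every row of each tabulated transition matrix must sum to 1.
stopifnot(all(sapply(trans, function(P) all(abs(rowSums(P) - 1) < 1e-12))))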
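# Usage sketch for policyIteration: it should recover the same optimal
# policy as value iteration, with values matching up to the 1e-4 tolerance.
sol2 <- policyIteration()
print(all(sol2$pol == sol$pol))
print(max(abs(sol2$V - sol$V)))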
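# Usage sketch for Qlearning (the run length 1e5 is an arbitrary choice):
# compare the learned greedy policy with the optimal policy.
ql <- Qlearning(1e5)
print(rbind(qlearning = ql$pol, optimal = sol$pol))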