# Simulation of the data:

logitinv <- function(x) {1/(1+exp(-x))};
n <- 1000;
myData <- data.frame(x1 = 2*runif(n)-1, x2 = 2*runif(n)^(1/2)-1);
#param <- 3*c(-1,5); myData$y <- sign(-runif(n)+logitinv(param[1]+param[2]*(-myData$x1^2 + myData$x2)));
param <- 6*c(-1,3); myData$y <- sign(-runif(n)+logitinv(param[1]+param[2]*(myData$x1^2 + myData$x2^2)));
plot(myData$x1, myData$x2, col= c("blue", "red")[(myData$y+3)/2], pch=19);
print(paste("proportion of label y=+1:", mean(myData$y==1)))



# helper functions:

entropyPurity <- function(w, s){
  #in: w = total weight
  #    s = weighted sum of labels
  p <- max(1e-10, min(1-1e-10, (s+w)/(2*w)));
  return(1 + p*log2(p) + (1-p)*log2(1-p));
}

errorPurity <- function(w,s){
  p <- max(1e-10, min(1-1e-10, (s+w)/(2*w)));
  return(1-2*min(p, 1-p));
}

findBestCut <- function(y, w, purity=entropyPurity){
  #in: y = list of labels,
  #    w = weigths of the points,
  #    purity = purity function to be optimized for the cut,
  #out: res$cut = index of the best cut in terms of the 0-1 valued purity function specified
  #     res$margins = weighted sums of the labels before, and after the cut
  
  n <- length(y);
  wL <- w[1];
  wR <- sum(w[-1]);
  sL <- w[1]*y[1];
  sR <- sum(w[-1]*y[-1]);
  bestPurity <- wL + wR*purity(wR, sR);
  res <- list(cut = 1, margins <- c(sL, sR), visu <- array(bestPurity, n-1));
  for (t in 2:(n-1)){
    wL <- wL + w[t];
    wR <- wR - w[t];
    sL <- sL + w[t]*y[t];
    sR <- sR - w[t]*y[t];
    curPurity <- wL * purity(wL, sL) + wR * purity(wR, sR);
    res$visu[t] <- curPurity;
    if (curPurity > bestPurity){
      bestPurity <- curPurity;
      res$margins <- c(sL, sR);
      res$cut <- t;
    }
  }
  return(res);
}

##  test:
sigma1 <- order(myData$x1);
x <- myData$x1[sigma1];
y <- myData$y[sigma1];
res <- findBestCut(y, array(1/n, n), entropyPurity)
plot(res$visu)

sigma2 <- order(myData$x2);
x <- myData$x2[sigma2];
y <- myData$y[sigma2];
res <- findBestCut(y, array(1/n, n), entropyPurity)
plot(res$visu)


# boosting with best stump in random direction

w <- array(1/n,n)
T <- 1000;

stumps <- array(0, c(T, 4)) # stumps[t, ] = (direction, value of cut, label of <=cut, label for >cut) 
epsilon <- array(0, T)
alpha <- array(0, T)
yh <- array(0, c(T, n))
testError <- array(0, T)

sigma <- array(0, c(n, 2)); #sigma[,j] = permutation sorting x_j
sx <- array(0, c(n, 2));
sy <- array(0, c(n, 2));
for(j in 1:2){
  x <- if (j==1) {myData$x1} else {myData$x2;};
  sigma[,j] <- order(x);
  sx[,j] <- x[sigma[,j]];
  sy[,j] <- myData$y[sigma[,j]];
}


stump.predict <- function(stump, x){
  if (x[stump[1]] <= stump[2]){
    pred <- stump[3];
  }
  else{
    pred <- stump[4];
  }
  return(pred);  
}


# main loop

for(t in 1:T){
  
  #readline("Press <return to continue") 
  
  dir <- 1+(runif(1)<0.5)
  stumps[t, 1] <- dir;
  res <- findBestCut(sy[,dir], w[sigma[,dir]], entropyPurity)
  stumps[t, 2] <- sx[res$cut, dir];
  stumps[t, 3:4] <- sign(res$margins);
    
  for (k in 1:n){ yh[t,k] <- stump.predict(stumps[t,], c(myData$x1[k], myData$x2[k]));}
  epsilon[t] <- sum(w * as.numeric(myData$y != yh[t,]));
  alpha[t] <- 1/2*log((1-epsilon[t])/epsilon[t]);
  
  w <- w * exp(-alpha[t] * myData$y * yh[t,]); 
  w <- w / sum(w);
  
  
  # visualize aggregate predictor
  
  AggregatePredictor <- function(x1,x2){
    tot <- 0;
    for (s in 1:t){
      tot = tot + alpha[s] * stump.predict(stumps[s,], c(x1, x2));
    }
    return(sign(tot));
  }
  
  if(t<20){
    
    tt <- seq(-1, 1, 0.05);
    Z <- outer(tt, tt, function(x,y) x+1i*y);
    totalPred <- outer(tt, tt, Vectorize(AggregatePredictor))
    
    plot(Re(Z), Im(Z), col=c("cyan", "lightpink")[(totalPred+3)/2], pch=19, cex=2)
    points(myData$x1, myData$x2, col= c("blue", "red")[(myData$y+3)/2], pch=19, cex=w*1.5/max(w));
    
    
    # visualize rules of thumb
    
    if(t<T){
      for (s in 1:t){
        if (stumps[s,1]==1){
          lines(stumps[s,2]*c(1,1), c(-1,1), col="green", type="l", lwd=3);
        }
        else{
          lines(c(-1, 1), stumps[s,2]*c(1,1), col="green", type="l", lwd=3);
        }
      }
    }
    # compute test error
    
    p <- alpha[1] * yh[1,];
    for (s in 2:t){
      p <- p + alpha[s] * yh[s,];
    }
    yhAgg <- sign(p);
    
    testError[t] <- mean(yhAgg != myData$y)
    print(paste("round", t, ": epsilon=", epsilon[t], "alpha=", alpha[t], "total test error: ",  testError[t]))
  #  print(paste("weight of labels y=+1: ", sum(w[myData$y==1])))
    readline("Press <return to continue");
  }
  else {print(t);}
}

tt <- seq(-1, 1, 0.02);
Z <- outer(tt, tt, function(x,y) x+1i*y);
totalPred <- outer(tt, tt, Vectorize(AggregatePredictor))

plot(Re(Z), Im(Z), col=c("cyan", "lightpink")[(totalPred+3)/2], pch=19, cex=1)
points(myData$x1, myData$x2, col= c("blue", "red")[(myData$y+3)/2], pch=19, cex=w*1.5/max(w));


# print results:

p <- alpha[1] * yh[1,];
for (s in 2:t){
  p <- p + alpha[s] * yh[s,];
}
yhAgg <- sign(p);

testError[t] <- mean(yhAgg != myData$y)
print(paste("round", t, ": epsilon=", epsilon[t], "alpha=", alpha[t], "total test error: ",  testError[t]))


# 
# ## boosting with linear regresssion => failure!
# w <- array(1,n)/n
# 
# T <- 10
# 
# coeffs <- array(0, c(T, 3))
# epsilon <- array(0, T)
# alpha <- array(0, T)
# yh <- array(0, c(T, n))
# testError <- array(0, T)
# 
# for(t in 1:T){
# 
#   #readline("Press <return to continue") 
#   linearPred <- lm(y~x1+x2, data=myData%2c.html weights = w);
# 
#   yh[t,] <- sign(predict(linearPred, myData));
#   epsilon[t] <- sum(w * as.numeric(myData$y != yh[t,]));
#   alpha[t] <- 1/2*log((1-epsilon[t])/epsilon[t]);
#   
#   w <- w * exp(-alpha[t] * myData$y * yh[t,]); 
#   w <- w / sum(w);
#   
#   coeffs[t,] <- coef(linearPred)
#   
#   #vizualize aggregate predictor
#   
#   AggregatePredictor <- function(x,y){
#     tot <- 0;
#     for (s in 1:t){
#       tot = tot + alpha[t] * sign(coeffs[s, 1] + coeffs[s, 2]*x+coeffs[s, 3]*y);
#     }
#     return(sign(tot));
#   }
#   
#   tt <- seq(-1, 1, 0.05);
#   Z <- outer(tt, tt, function(x,y) x+1i*y);
#   totalPred <- outer(tt, tt, AggregatePredictor)
#   
#   plot(Re(Z), Im(Z), col=c("cyan", "magenta")[(totalPred+3)/2], pch=19)
#   points(myData$x1, myData$x2, col= c("blue", "red")[(myData$y+3)/2], pch=19, cex=w*1.5/max(w));
#   
#   
#   # vizualize rules of thumb
#   
#   xx <- c(-1, 1);
#   for (s in 1:t){
#     lines(xx, -(coeffs[s, 1]+coeffs[s, 2]*xx)/coeffs[s, 3], col="green", type="l", lwd=3);
#   }
#   
#   # compute test error
#   
#   p <- alpha[1] * yh[1,];
#   for (s in 2:t){
#     p <- p + alpha[s] * yh[s,];
#   }
#   yhAgg <- sign(p);
#   
#   testError[t] <- mean(yhAgg != myData$y)
#   print(paste("round", t, ": epsilon=", epsilon[t], "alpha=", alpha[t], "total test error: ",  testError[t]))
#   print(paste("weight of labels y=+1: ", sum(w[myData$y==1])))
# 
#   
#   readline("Press <return to continue") 
# }
# 
#