# creation of the sample
alpha <- 0.2; beta <- 0.8;
# the following assumes alpha < 0.5 < beta
rexemple <- function(n){
  res <- list();
  res$x <- cbind(runif(n), runif(n));
  res$y <- numeric(n);
  u <- runif(n);
  below <- 2 * res$x[,1] + res$x[,2] < 1.5;   # below the true frontier
  res$y[below]  <- as.numeric(u[below]  < beta);
  res$y[!below] <- as.numeric(u[!below] < alpha);
  res
}
n <- 1000;
train <- rexemple(n);

# method 1: Bayes classifier (predicts 1 below the true frontier 2*x1 + x2 = 1.5)
bayesClassifier <- function(x){ as.numeric(2 * x[,1] + x[,2] < 1.5) }
# Monte Carlo estimate of a classifier's error on a fresh large sample
evaluateClassifError <- function(classifier){
  test <- rexemple(50000);
  mean(test$y != classifier(test$x))
}
bayesError <- evaluateClassifError(bayesClassifier)

# method 2: k-nn
require(class)
test <- rexemple(50000); # could also use LHS here
test$guessed <- knn(train$x, test$x, train$y, k = 1)
knnError <- mean(test$y != test$guessed)
plot(test$x[,1], test$x[,2],
     col = c("pink", "cyan")[as.numeric(test$guessed)],
     pch = 19, xlim = c(-0.2, 1.2), ylim = c(-0.2, 1.2))
points(train$x[,1], train$x[,2], col = c("red", "blue")[1 + train$y], pch = 19)
lines(c(0, 1), c(1.5, -0.5), col = "black")

# graph legend and result prints
title("simulated data classification")
legend(0.8, 1.2, c("true frontier", "k-nn zone 1", "k-nn zone 0"),
       col = c("black", "cyan", "pink"), lty = 1);
cat("Bayes classification error:\t", bayesError,
    "\nknn classifier error:\t", knnError, "\n")

# choice of k by cross-validation
K <- 10; L <- floor(n / K);
Ik <- 1:(n - L - 1);                # candidate values of k
err <- array(0, length(Ik));
for(i in 1:K){
  plageTest <- (i - 1) * L + (1:L);   # indices of the i-th validation fold
  rtrain <- list(x = train$x[-plageTest, ], y = train$y[-plageTest]);
  vtest  <- list(x = train$x[plageTest, ],  y = train$y[plageTest]);
  for(k in 1:length(Ik)){
    vtest$guessed <- knn(rtrain$x, vtest$x, rtrain$y, k = Ik[k])
    err[k] <- err[k] + mean(vtest$y != vtest$guessed) / K   # average over the K folds
  }
}
x11()   # open a new plotting window (X11)
plot(Ik, err, type = 'l');
kopt <- Ik[which.min(err)];
kopt
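
# A quick check of the Bayes error (an added sketch, assuming the generative
# model above): P(Y = 1 | x) equals beta below the frontier and alpha above it,
# so the pointwise risk of the Bayes classifier is min(p, 1 - p) = 0.2 on both
# sides, and bayesError should be close to the exact value computed here.
pBelow <- 0.5   # the frontier 2*x1 + x2 = 1.5 splits the unit square into two equal areas
exactBayesError <- (1 - beta) * pBelow + alpha * (1 - pBelow)   # = 0.2 here
cat("exact Bayes error:", exactBayesError, "\n")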
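
# Precision of the Monte Carlo error estimates (a short added note): with a
# test sample of size m, an estimated error rate e has standard error
# sqrt(e * (1 - e) / m), i.e. roughly +/- 0.004 around 0.2 for m = 50000.
m <- 50000
cat("knn error:", knnError, "+/-", 2 * sqrt(knnError * (1 - knnError) / m),
    "(approx. 95% interval)\n")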
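
# The comment "could also use LHS here" refers to Latin hypercube sampling of
# the test inputs; a minimal sketch with the lhs package (an extra dependency,
# assumed installed; randomLHS(n, k) returns n space-filling points in [0,1]^k).
require(lhs)
mLHS <- 50000
xLHS <- randomLHS(mLHS, 2)
uLHS <- runif(mLHS)
belowLHS <- 2 * xLHS[,1] + xLHS[,2] < 1.5
yLHS <- as.numeric(ifelse(belowLHS, uLHS < beta, uLHS < alpha))  # same labelling model as rexemple
guessedLHS <- knn(train$x, xLHS, train$y, k = 1)
cat("knn error on the LHS test sample:", mean(yLHS != guessedLHS), "\n")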
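
# An alternative to the explicit fold loop (a sketch, not the original method):
# class::knn.cv computes leave-one-out predictions on the training set directly,
# so no manual splitting is needed. The grid kGrid is an arbitrary choice here.
kGrid <- 1:100
looErr <- sapply(kGrid, function(k) mean(knn.cv(train$x, train$y, k = k) != train$y))
cat("k chosen by leave-one-out:", kGrid[which.min(looErr)], "\n")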