## Initialisation
#setwd("~/ownCloud/DataMining/Data/")
#setwd("d:/ownCloud/dataMining/data")
setwd("C:/Users/admin/ownCloud/Datamining/Data")
#setwd("C:/Users/agarivie/owncloud/DataMining/Data")

## Loading the data
cClasses <- c("numeric", "factor", "numeric", "factor", "numeric", "factor", "factor",
              "factor", "factor", "character", "numeric", "numeric", "numeric",
              "factor", "factor")
cNames <- c("age", "workclass", "fnlwgt", "education", "education.num", "maritalStatus",
            "occupation", "relationship", "race", "sex", "capitalGain", "capitalLoss",
            "hoursPerWeek", "nativeCountry", "isUpper")
adult <- read.table("adult.data", colClasses = cClasses, col.names = cNames, sep = ",")
adult$isUpper <- as.numeric(adult$isUpper == " >50K")
cClasses[15] <- "numeric"  # isUpper is now numeric; keep cClasses consistent for the factor handling below
summary(adult)
#adult.test <- read.table("adult.test",
#                         colClasses = c("numeric", "factor", "numeric", "factor", "numeric",
#                                        "factor", "factor", "factor", "factor", "character",
#                                        "numeric", "numeric", "numeric", "factor", "factor"),
#                         sep = ",", skip = 1)
#colnames(adult.test) <- cNames
#adult.test$isUpper <- as.factor(as.numeric(adult.test$isUpper == " >50K"))

## Splitting the sample
n <- 1000   # size of the training sample
nv <- 2000  # size of the validation sample
K <- sample(1:nrow(adult), n + nv)
train <- adult[K[1:n], ]
valid <- adult[K[(n + 1):(n + nv)], ]
test <- adult[-K, ]

## Utility functions
t <- seq(0, 90, 0.1)  # grid of ages used to draw decision boundaries

# misclassification risk of a classifier, estimated on the test sample
MCrisk <- function(classif){
  mean(classif(test) != test$isUpper)
}

# draws the decision-region boundaries of a tree fitted with the 'tree' package
linesTree <- function(tr){
  f <- tr$frame
  lx <- c()
  ly <- c()
  for (j in 1:dim(f)[1]){
    var <- f[j, 1]
    if ((var == "age") || (var == "education.num")){
      b <- as.double(substr(f[j, 5][1], 2, 5))
      if (var == "age"){
        lx <- c(lx, b)
        lines(c(b, b), c(0, 18), lwd = 3)
      } else {
        ly <- c(ly, b)
        lines(c(15, 88), c(b, b), lwd = 3)
      }
    }
  }
}

#require(lhs)
# colours the plane according to the decision of a classifier
drawRule <- function(classif){
  #N <- 1000; X <- maximinLHS(n=N, k=2); X[,1] <- 15+73*X[,1]; X[,2] <- 17*X[,2]
  N <- 50
  X <- array(0, dim = c(N^2, 2))
  for (i in 1:N)
    for (j in 1:N){
      X[(i - 1) * N + j, ] <- c(15 + 73 * i / N, 17 * j / N)  # fixed indexing (was (N-1)*i+j, which overwrote cells)
    }
  points(X[, 1], X[, 2],
         col = c("pink", "cyan")[1 + classif(data.frame(age = X[, 1], education.num = X[, 2]))],
         pch = 15, cex = 2.5)
}

## Visualising the data
#pdf("salaires.pdf")
plot(train$age + 0.2 * rnorm(n), train$education.num + 0.2 * rnorm(n),
     col = c("red", "blue")[1 + train$isUpper], pch = 20,
     xlab = "age", ylab = "education.num",
     main = "salary (blue: >=50K$, red: <50K$)")
#dev.off()

## Majority classifier
classeMaj <- as.numeric(mean(train$isUpper) > 1/2)
maj.rule <- function(feature){
  rep(classeMaj, nrow(feature))
}
confusion <- table(maj.rule(test), test$isUpper)
cat('Confusion table for the majority classifier: ')
print(confusion)
majoritaire.risk <- confusion[1, 2] / sum(confusion)
cat(paste("Risk of the majority classifier: ", majoritaire.risk))

## Generative model
fake <- data.frame(age = sample(adult$age, n),
                   education.num = sample(adult$education.num, n))
# NOTE: the original definitions of the simulated labels, of the Bayes rule and of the
# plot() call were garbled in the source; the reconstruction below is an assumption,
# chosen to be consistent with the decision boundary
# education.num = 10 + (age - 45)^2 / 25 drawn in magenta further down.
pUpper <- function(feature){
  plogis(feature$education.num - 10 - (feature$age - 45)^2 / 25)
}
fake$isUpper <- as.numeric(runif(n) < pUpper(fake))
bayes.rule <- function(feature){
  as.numeric(pUpper(feature) > 0.5)
}
plot(0, 0, xlim = c(15, 88), ylim = c(0, 17), main = "Bayes classifier (generative model)",
     xlab = "age", ylab = "education.num")
drawRule(bayes.rule)
points(fake$age + 0.2 * rnorm(n), fake$education.num + 0.2 * rnorm(n),
       col = c("red", "blue")[1 + fake$isUpper], pch = 20,
       xlab = "age", ylab = "education.num",
       main = "simulated data (blue: >=50K$, red: <50K$)")
lines(t, 10 + (t - 45)^2 / 25, lwd = 6, col = 'magenta')
bayes.fake.confusion <- table(bayes.rule(test), test$isUpper)
cat('Confusion table of the Bayes classifier of the generative model: ')
print(bayes.fake.confusion)
bayes.fake.risk <- (bayes.fake.confusion[1, 2] + bayes.fake.confusion[2, 1]) / sum(bayes.fake.confusion)
cat(paste("Risk of the Bayes classifier of the generative model: ", bayes.fake.risk))
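# Optional check (added, not in the original script): under the simulated model above,
# the Bayes rule can also be scored against the simulated labels themselves, which gives
# an estimate of the irreducible (Bayes) error of that model. 'fake', 'pUpper' and
# 'bayes.rule' are the reconstructed objects defined just above, so this is only an illustration.
bayes.sim.risk <- mean(bayes.rule(fake) != fake$isUpper)
cat(paste("Empirical risk of the Bayes rule on the simulated data: ", bayes.sim.risk))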
cat(paste("risque du classifieur bayésien du modèle génératif: ", bayes.fake.risk)) ## k-NN require(class) knn.rule <- function(feature, k=20){ as.numeric(knn(train[, c("age", "education.num")], feature[, c("age", "education.num")], factor(train$isUpper), k))-1 } plot(0,0, xlim = c(15,88), ylim = c(0,17), main="k-NN", xlab="age", ylab="education.num"); drawRule(knn.rule) points(train$age+0.2*rnorm(n), train$education.num+0.2*rnorm(n), col = c("red", "blue")[1+train$isUpper], pch=20, xlab="age", ylab="education.num", main="données simulées (bleu: >=50K$, rouge: <50k$)") k <- 15; knn(train[, c("age", "education.num")], list(age=45, education.num = 15), factor(train$isUpper), k, prob=TRUE) knn(train[, c("age", "education.num")], list(age=20, education.num = 5), factor(train$isUpper), k, prob=TRUE) knn(train[, c("age", "education.num")], list(age=85, education.num = 15), factor(train$isUpper), k, prob=TRUE) Ik <- c(1:10, seq(10, 200, 20)); N <- length(Ik) knn.est.risk <- array(0, N) for (i in 1:N){ knn.est.risk[i] <- mean(knn.rule(valid, Ik[i]) != valid$isUpper); print(paste(floor(100*i/N), '%')) } kopt <- Ik[which.min(knn.est.risk)] knn.confusion <- table(knn.rule(test, kopt), test$isUpper) knn.risk <-(knn.confusion[1,2] + knn.confusion[2,1])/sum(knn.confusion) cat('table de confusion pour le meilleur classifieur k-NN : '); print(knn.confusion) cat(paste("risque du meilleur classifieur k-NN : ", knn.risk, "(risque estimé ", knn.est.risk[kopt],")", "atteint pour k = ", kopt)) plot(Ik, knn.est.risk, type='o', xlab='k', main='Risque de k-NN en fonction du nombre de voisins') ## régression linéaire # préliminaire plot(train$education.num+0.1*rnorm(n), train$isUpper) reglinsimple <- lm(isUpper ~ education.num, data=train).html summary(reglinsimple) abline(reglinsimple) # début lm.reg <- lm(isUpper ~ age + education.num, data=train).html summary(lm.reg) lm.rule <- function(feature) { as.numeric(predict(lm.reg, newdata=feature)>0.5) } plot(0,0, xlim = c(15,88), ylim = c(0,17), main="classification linéaire", xlab="age", ylab="education.num"); drawRule(lm.rule); points(train$age+0.2*rnorm(n), train$education.num+0.2*rnorm(n), col = c("red", "blue")[1+as.numeric(train$isUpper)], pch=20, xlab="age", ylab="education.num", main="salaire (bleu: >=50K$, rouge: <50k$)") lines(t, (0.5 - lm.reg$coefficients[1] - lm.reg$coefficients[2]*t) / lm.reg$coefficients[3] , lwd=6, col='magenta') lm.confusion <- table(lm.rule(test), test$isUpper) cat('table de confusion pour le classifieur issu de la régression linéaire : '); print(lm.confusion) lm.risk <-(lm.confusion[1,2] + lm.confusion[2,1])/sum(lm.confusion) cat(paste("risque du classifieur issu de la régression linéaire : ", lm.risk)) predict(lm.reg, list(age=45, education.num = 15)) predict(lm.reg, list(age=20, education.num = 5)) predict(lm.reg, list(age=85, education.num = 15)) ## régression logistique # on commence seulement avec les deux variables log.reg <- glm(isUpper ~ age + education.num, data=train%2c.html family = "binomial") summary(log.reg) log.rule <- function(feature) { as.numeric(predict(log.reg, newdata=feature)>0) # ou bien # as.numeric(predict(log.reg, newdata=feature, type="response")>0.5) } plot(0,0, xlim = c(15,88), ylim = c(0,17), main="classification par régression logistique", xlab="age", ylab="education.num") drawRule(log.rule); points(train$age+0.2*rnorm(n), train$education.num+0.2*rnorm(n), col = c("red", "blue")[1+as.numeric(train$isUpper)], pch=20, xlab="age", ylab="education.num", main="salaire (bleu: >=50K$, rouge: <50k$)") lines(t, 
log.confusion <- table(log.rule(test), test$isUpper)
cat('Confusion table for the classifier derived from logistic regression: ')
print(log.confusion)
log.risk <- (log.confusion[1, 2] + log.confusion[2, 1]) / sum(log.confusion)
cat(paste("Risk of the classifier derived from logistic regression: ", log.risk))
predict(log.reg, list(age = 45, education.num = 15), type = "response", se.fit = TRUE)
predict(log.reg, list(age = 20, education.num = 5), type = "response", se.fit = TRUE)
predict(log.reg, list(age = 85, education.num = 15), type = "response", se.fit = TRUE)

## Model choice for logistic regression: backward method
# start from an overly rich model, for instance
log.goodreg <- glm(isUpper ~ age + workclass + education.num + maritalStatus + occupation +
                     relationship + race + sex + capitalGain + capitalLoss + hoursPerWeek,
                   data = train, family = "binomial")
anova(log.goodreg, test = "Chisq")
# 'race' and 'maritalStatus' are not significant.
# Remove the least significant one: 'race'
log.goodreg <- glm(isUpper ~ age + workclass + education.num + maritalStatus + occupation +
                     relationship + sex + capitalGain + capitalLoss + hoursPerWeek,
                   data = train, family = "binomial")
# note: this can also be done with the 'step' command
anova(log.goodreg, test = "Chisq")
# 'maritalStatus' is still not significant, and now neither is 'sex'.
# Remove the least significant one: 'maritalStatus'
log.goodreg <- glm(isUpper ~ age + workclass + education.num + occupation + relationship +
                     sex + capitalGain + capitalLoss + hoursPerWeek,
                   data = train, family = "binomial")
anova(log.goodreg, test = "Chisq")
# 'sex' is still not significant; remove it
log.goodreg <- glm(isUpper ~ age + workclass + education.num + occupation + relationship +
                     capitalGain + capitalLoss + hoursPerWeek,
                   data = train, family = "binomial")
anova(log.goodreg, test = "Chisq")
# All remaining variables are now significant; keep this model for prediction.
# Remark: the full logistic regression with automatic model selection can also be run
# with the following two lines:
log.fullreg <- glm(isUpper ~ age + workclass + education + education.num + maritalStatus +
                     occupation + relationship + race + sex + capitalGain + capitalLoss +
                     hoursPerWeek + nativeCountry,
                   data = train, family = "binomial")
summary(log.fullreg)
# let 'step' choose the model on its own
log.fullreg <- step(log.fullreg, direction = "backward", trace = FALSE)

## Logistic regression: using the selected model
log.full.rule <- function(feature){
  res <- as.numeric(predict(log.fullreg, newdata = feature) > 0)
  # some predictions are not available (unseen factor levels); classify them arbitrarily to avoid errors
  res[is.na(res)] <- runif(1) < 0.5
  res
}
# CAUTION: factor levels not seen in the training sample must be handled separately
test.sav <- test
for (pred in which(cClasses == "factor")){
  tab <- table(train[, pred])
  id <- which(!(test[, pred] %in% levels(train[, pred])) | tab[test[, pred]] == 0)
  if (length(id) > 0){ test[id, pred] <- NA }
}
log.full.confusion <- table(log.full.rule(test), test$isUpper)
test <- test.sav
cat('Confusion table for the classifier derived from the full logistic regression: ')
print(log.full.confusion)
log.full.risk <- (log.full.confusion[1, 2] + log.full.confusion[2, 1]) / sum(log.full.confusion)
cat(paste("Risk of the classifier derived from the full logistic regression: ", log.full.risk))
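# Optional comparison (added, not in the original script): the hand-selected model
# 'log.goodreg' and the model retained by step() can be compared through their AIC
# (lower is better); both objects are defined in the backward-selection section above.
print(AIC(log.goodreg, log.fullreg))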
## CART algorithm
# either the 'rpart' or the 'tree' package can be used
require(tree)
cart.pred <- tree(factor(isUpper) ~ age + education.num,
                  control = tree.control(n, mindev = 0, minsize = 10), data = train)
cart.pred
summary(cart.pred)
plot(train$age + 0.2 * rnorm(n), train$education.num + 0.2 * rnorm(n),
     col = c("red", "blue")[1 + as.numeric(train$isUpper)], pch = 20,
     xlab = "age", ylab = "education.num",
     main = "salary (blue: >=50K$, red: <50K$)")
linesTree(cart.pred)
predict(cart.pred, newdata = list(age = 31, education.num = 12))
points(31, 12, pch = 19, col = 'magenta')
predict(cart.pred, newdata = list(age = 31, education.num = 12), type = "class")
cart.rule <- function(feature){
  as.numeric(predict(cart.pred, newdata = feature, type = "class")) - 1
}
table(cart.rule(test), test$isUpper)
cart.risk <- MCrisk(cart.rule)
cat(paste("Risk of the unpruned CART classifier: ", cart.risk))
cart.confusion <- table(cart.rule(test), test$isUpper)
cat('Confusion table for the unpruned CART classifier: ')
print(cart.confusion)
cart.risk <- (cart.confusion[1, 2] + cart.confusion[2, 1]) / sum(cart.confusion)
cat(paste("Risk of the unpruned CART classifier: ", cart.risk))
plot(0, 0, xlim = c(15, 88), ylim = c(0, 17), main = "CART",
     xlab = "age", ylab = "education.num")
drawRule(cart.rule)
points(train$age + 0.2 * rnorm(n), train$education.num + 0.2 * rnorm(n),
       col = c("red", "blue")[1 + train$isUpper], pch = 20,
       xlab = "age", ylab = "education.num",
       main = "training data (blue: >=50K$, red: <50K$)")
linesTree(cart.pred)

# pruning to a fixed number of leaves
nbLeaves <- 20
cart.pruned <- prune.tree(cart.pred, best = nbLeaves, method = "misclass")
cart.pruned
summary(cart.pruned)
cart.pruned.rule <- function(feature){
  as.numeric(predict(cart.pruned, newdata = feature, type = "class")) - 1
}
plot(0, 0, xlim = c(15, 88), ylim = c(0, 17), main = "pruned CART",
     xlab = "age", ylab = "education.num")
drawRule(cart.pruned.rule)
linesTree(cart.pruned)
cart.pruned.confusion <- table(cart.pruned.rule(test), test$isUpper)
cat('Confusion table for the pruned CART classifier: ')
print(cart.pruned.confusion)
cart.pruned.risk <- (cart.pruned.confusion[1, 2] + cart.pruned.confusion[2, 1]) / sum(cart.pruned.confusion)
cat(paste("Risk of the pruned CART classifier: ", cart.pruned.risk))

# choice of the 'best' parameter by V-fold cross-validation
V <- 5
I <- array(0, dim = c(V, n / V))
reste <- 1:n
for (i in 1:(V - 1)){
  I[i, ] <- sample(reste, n / V)
  reste <- setdiff(reste, I[i, ])
}
I[V, ] <- reste
Ibest <- c(2:10, seq(12, 20, 2), 30)
N <- length(Ibest)
scor <- array(0, N)
for (i in 1:V){
  cv_test <- train[I[i, ], ]
  cv_train <- train[setdiff(1:n, I[i, ]), ]
  arbre <- tree(factor(isUpper) ~ age + education.num,
                control = tree.control(nrow(cv_train), mindev = 0, minsize = 10), data = cv_train)
  for (k in 1:N){
    b <- Ibest[k]
    pruned <- prune.tree(arbre, best = b, method = "misclass")
    predictions <- predict(pruned, newdata = cv_test, type = "class")
    scor[k] <- scor[k] + sum(predictions != cv_test$isUpper)
  }
}
plot(Ibest, scor)
bopt <- Ibest[which.min(scor)]
arbre <- tree(factor(isUpper) ~ age + education.num,
              control = tree.control(n, mindev = 0, minsize = 10), data = train)
arbre.pruned <- prune.tree(arbre, best = bopt, method = "misclass")
cart.cv.pruned.rule <- function(feature){
  as.numeric(predict(arbre.pruned, newdata = feature, type = "class")) - 1
}
cart.cv.pruned.confusion <- table(cart.cv.pruned.rule(test), test$isUpper)
cat('Confusion table for the pruned CART classifier with the number of leaves chosen by cross-validation: ')
print(cart.cv.pruned.confusion)
cart.cv.pruned.risk <- (cart.cv.pruned.confusion[1, 2] + cart.cv.pruned.confusion[2, 1]) / sum(cart.cv.pruned.confusion)
cat(paste("Risk of the pruned CART classifier with the number of leaves chosen by cross-validation: ",
          cart.cv.pruned.risk))
plot(0, 0, xlim = c(15, 88), ylim = c(0, 17), main = "CART with cross-validation",
     xlab = "age", ylab = "education.num")
drawRule(cart.cv.pruned.rule)
points(train$age + 0.2 * rnorm(n), train$education.num + 0.2 * rnorm(n),
       col = c("red", "blue")[1 + train$isUpper], pch = 20,
       xlab = "age", ylab = "education.num",
       main = "training data (blue: >=50K$, red: <50K$)")
linesTree(arbre.pruned)
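# Alternative (added, not in the original script): the 'tree' package ships its own
# cross-validation routine, cv.tree(), which gives a similar size-vs-error curve and can
# be compared with the hand-rolled V-fold loop above; 'arbre' is the tree refitted on
# the full training sample above.
cart.cv <- cv.tree(arbre, FUN = prune.misclass)
plot(cart.cv$size, cart.cv$dev, type = 'o',
     xlab = 'number of leaves', ylab = 'CV misclassification count')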
## Neural networks
library(nnet)
# fitting
nnet.fit <- nnet(factor(isUpper) ~ age + education.num, data = train, size = 6, decay = 1)
summary(nnet.fit)
# choice of the number of hidden units on the validation sample
Isize <- c(1:10, seq(10, 100, 10), 150)
N <- length(Isize)
nnet.risk <- array(0, N)
for (i in 1:N){
  nnet.fit <- nnet(factor(isUpper) ~ age + education.num, data = train,
                   size = Isize[i], decay = 1, trace = FALSE)
  nnet.risk[i] <- mean(as.numeric(predict(nnet.fit, newdata = valid) > 0.5) != valid$isUpper)
  print(paste(floor(100 * i / N), '%'))
}
sizeopt <- Isize[which.min(nnet.risk)]
nnet.fit <- nnet(factor(isUpper) ~ age + education.num, data = train, size = sizeopt, decay = 1)
plot(Isize, nnet.risk)
nnet.rule <- function(feature){
  predict(nnet.fit, newdata = feature) > 1/2
}
plot(0, 0, xlim = c(15, 88), ylim = c(0, 17), main = "Neural Network",
     xlab = "age", ylab = "education.num")
drawRule(nnet.rule)
points(train$age + 0.2 * rnorm(n), train$education.num + 0.2 * rnorm(n),
       col = c("red", "blue")[1 + train$isUpper], pch = 20,
       xlab = "age", ylab = "education.num",
       main = "training data (blue: >=50K$, red: <50K$)")
nnet.confusion <- table(nnet.rule(test), test$isUpper)
cat('Confusion table of the neural network: ')
print(nnet.confusion)
nnet.risk <- (nnet.confusion[1, 2] + nnet.confusion[2, 1]) / sum(nnet.confusion)
cat(paste("Risk of the neural network: ", nnet.risk))

## Random forests
library(randomForest)
rf.fit <- randomForest(factor(isUpper) ~ age + education.num, data = train)
print(rf.fit)
print(round(rf.fit$importance, 2))
rf.rule <- function(feature){
  as.numeric(predict(rf.fit, newdata = feature)) - 1
}
plot(0, 0, xlim = c(15, 88), ylim = c(0, 17), main = "Random Forest",
     xlab = "age", ylab = "education.num")
drawRule(rf.rule)
points(train$age + 0.2 * rnorm(n), train$education.num + 0.2 * rnorm(n),
       col = c("red", "blue")[1 + train$isUpper], pch = 20,
       xlab = "age", ylab = "education.num",
       main = "training data (blue: >=50K$, red: <50K$)")
rf.confusion <- table(rf.rule(test), test$isUpper)
cat('Confusion table of the random forest classifier: ')
print(rf.confusion)
rf.risk <- (rf.confusion[1, 2] + rf.confusion[2, 1]) / sum(rf.confusion)
cat(paste("Risk of the random forest classifier: ", rf.risk))

# Random forest on all the quantitative variables
rf.full.fit <- randomForest(factor(isUpper) ~ age + education.num + capitalGain + capitalLoss + hoursPerWeek,
                            data = train)
rf.full.rule <- function(feature){
  as.numeric(predict(rf.full.fit, newdata = feature)) - 1
}
rf.full.confusion <- table(rf.full.rule(test), test$isUpper)
cat('Confusion table of the random forest classifier on all the quantitative variables: ')
print(rf.full.confusion)
rf.full.risk <- (rf.full.confusion[1, 2] + rf.full.confusion[2, 1]) / sum(rf.full.confusion)
cat(paste("Risk of the random forest classifier on all the quantitative variables: ", rf.full.risk))
cat("Variable importance: ")
print(round(rf.full.fit$importance, 2))
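# Optional visualisation (added, not in the original script): randomForest also provides
# varImpPlot() to display the same importance measures graphically.
varImpPlot(rf.full.fit, main = "Variable importance (random forest)")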
## SVM
library(e1071)
svm.fit <- svm(factor(isUpper) ~ age + education.num, data = train, cost = 1000)
summary(svm.fit)
predict(svm.fit, newdata = data.frame(age = 31, education.num = 12))
predict(svm.fit, newdata = data.frame(age = 40, education.num = 15))
svm.rule <- function(feature){
  as.numeric(predict(svm.fit, newdata = feature)) - 1
}
plot(0, 0, xlim = c(15, 88), ylim = c(0, 17), main = "SVM",
     xlab = "age", ylab = "education.num")
drawRule(svm.rule)
points(train$age + 0.2 * rnorm(n), train$education.num + 0.2 * rnorm(n),
       col = c("red", "blue")[1 + train$isUpper], pch = 20,
       xlab = "age", ylab = "education.num",
       main = "training data (blue: >=50K$, red: <50K$)")
svm.confusion <- table(svm.rule(test), test$isUpper)
cat('Confusion table of the untuned SVM classifier: ')
print(svm.confusion)
svm.risk <- (svm.confusion[1, 2] + svm.confusion[2, 1]) / sum(svm.confusion)
cat(paste("Risk of the untuned SVM classifier: ", svm.risk))
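# Possible follow-up (added, not in the original script): the SVM above uses an arbitrary
# cost of 1000; e1071's tune() can search a grid of cost values by cross-validation.
# The grid below is only an illustrative assumption.
svm.tuned <- tune(svm, factor(isUpper) ~ age + education.num, data = train,
                  ranges = list(cost = c(0.1, 1, 10, 100, 1000)))
summary(svm.tuned)
svm.best <- svm.tuned$best.model
mean(as.numeric(predict(svm.best, newdata = test)) - 1 != test$isUpper)  # test risk of the tuned SVM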