Random Forest in R: Recommending a College Based on Rank and Branch Choice
R Notebook-College and Rank
This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Load the rank/college/branch table. Since R 4.0.0 read.csv() no longer
# converts strings to factors by default, but randomForest() only fits a
# classifier when the response (College) is a factor, so request factors
# explicitly.
ds <- read.csv("collegedatainput.csv", stringsAsFactors = TRUE)
head(ds)
  Rank College Branch
1    1    IITB    CSE
2    3    IITB    CSE
3   10    IITB    CSE
4   20    IITB    CSE
5   30    IITB    CSE
6   40    IITB    CSE
str(ds)
'data.frame':   47 obs. of  3 variables:
 $ Rank   : int  1 3 10 20 30 40 50 260 270 280 ...
 $ College: Factor w/ 3 levels "IITB","IITD",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Branch : Factor w/ 3 levels "CSE","EEE","MECH": 1 1 1 1 1 1 1 2 2 2 ...
ds
   Rank College Branch
1     1    IITB    CSE
2     3    IITB    CSE
3    10    IITB    CSE
4    20    IITB    CSE
5    30    IITB    CSE
6    40    IITB    CSE
7    50    IITB    CSE
8   260    IITB    EEE
9   270    IITB    EEE
10  280    IITB    EEE
11  290    IITB    EEE
12  300    IITB    EEE
13  360    IITB   MECH
14  370    IITB   MECH
15  380    IITB   MECH
16  390    IITB   MECH
17  400    IITB   MECH
18   60    IITM    CSE
19   70    IITM    CSE
20   80    IITM    CSE
21   90    IITM    CSE
22  100    IITM    CSE
23  210    IITM   MECH
24  220    IITM   MECH
25  230    IITM   MECH
26  240    IITM   MECH
27  250    IITM   MECH
28  310    IITM    EEE
29  320    IITM    EEE
30  330    IITM    EEE
31  340    IITM    EEE
32  350    IITM    EEE
33  110    IITD    CSE
34  120    IITD    CSE
35  130    IITD    CSE
36  140    IITD    CSE
37  150    IITD    CSE
38  160    IITD   MECH
39  170    IITD   MECH
40  180    IITD   MECH
41  190    IITD   MECH
42  200    IITD   MECH
43  410    IITD    EEE
44  420    IITD    EEE
45  430    IITD    EEE
46  440    IITD    EEE
47  450    IITD    EEE
library(randomForest)

# Number of rows available for splitting.
nrow(ds)

# Train on 75% of the rows (rounded down); the rest is held out for testing.
smp_size <- floor(0.75 * nrow(ds))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(ds)), size = smp_size)
train_ds <- ds[train_ind, ]
test_ds <- ds[-train_ind, ]

# Fit a 100-tree forest predicting College from every other column
# (Rank and Branch). proximity = TRUE additionally asks for the proximity
# measure among the training rows.
colleg_rf <- randomForest(
  College ~ .,
  data = train_ds,
  ntree = 100,
  proximity = TRUE
)
colleg_rf
Call:
 randomForest(formula = College ~ ., data = train_ds, ntree = 100,      proximity = TRUE) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 1
        OOB estimate of  error rate: 20%
Confusion matrix:
     IITB IITD IITM class.error
IITB   13    0    0   0.0000000
IITD    3    7    1   0.3636364
IITM    2    1    8   0.2727273
# Cross-tabulate predicted (rows) against actual (columns) college for the
# training rows. Called without newdata, predict() on a randomForest object
# returns the out-of-bag predictions, which is why this table is the
# transpose of the OOB confusion matrix printed with the model.
table(predict(colleg_rf),train_ds$College)
      
       IITB IITD IITM
  IITB   13    3    2
  IITD    0    7    1
  IITM    0    1    8
varImpPlot(colleg_rf)
 
# Predict a college for every held-out row using the fitted forest.
collegePred<-predict(colleg_rf,newdata=test_ds)
# Number of held-out rows being scored.
nrow(test_ds)
[1] 12
# Confusion table for the held-out rows: rows are the predicted college,
# columns are the true college.
table(collegePred, test_ds$College)
           
collegePred IITB IITD IITM
       IITB    4    0    3
       IITD    0    3    0
       IITM    0    1    1
head(test_ds)
   Rank College Branch
5    30    IITB    CSE
7    50    IITB    CSE
9   270    IITB    EEE
13  360    IITB   MECH
26  240    IITM   MECH
30  330    IITM    EEE
# Hand-built single-row input: Rank 100 with Branch "CSE".
testinput <- data.frame(Rank = 100, Branch = "CSE")
testinput
  Rank Branch
1  100    CSE
str(testinput)
'data.frame':   1 obs. of  2 variables:
 $ Rank  : num 100
 $ Branch: Factor w/ 1 level "CSE": 1
# Read the rows to score. Ask for factors explicitly: since R 4.0.0
# read.csv() returns character columns by default, whereas the training
# data holds Branch as a factor.
testin <- read.csv("TestIn.csv", stringsAsFactors = TRUE)
str(testin)
'data.frame':   2 obs. of  2 variables:
 $ Rank  : int  100 310
 $ Branch: Factor w/ 1 level "CSE": 1 1
nrow(testin)
[1] 2
# Give Branch the same level set as the training data so predict() accepts it.
# Rebuild the factor by label instead of assigning levels(): `levels<-`
# renames the existing levels by position, so a file containing only "MECH"
# would be silently relabelled as "CSE". factor() matches on the values
# themselves and also works when Branch was read as a character column.
testin$Branch <- factor(testin$Branch, levels = levels(train_ds$Branch))
str(testin)
'data.frame':   2 obs. of  2 variables:
 $ Rank  : int  100 310
 $ Branch: Factor w/ 3 levels "CSE","EEE","MECH": 1 1
testin
  Rank Branch
1  100    CSE
2  310    CSE
predict(colleg_rf,testin,type="response")
   1    2 
IITM IITD 
Levels: IITB IITD IITM
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
# Load the rank/college/branch table. Since R 4.0.0 read.csv() no longer
# converts strings to factors by default, but randomForest() only fits a
# classifier when the response (College) is a factor, so request factors
# explicitly.
ds <- read.csv("collegedatainput.csv", stringsAsFactors = TRUE)
head(ds)
  Rank College Branch
1    1    IITB    CSE
2    3    IITB    CSE
3   10    IITB    CSE
4   20    IITB    CSE
5   30    IITB    CSE
6   40    IITB    CSE
str(ds)
'data.frame':   47 obs. of  3 variables:
 $ Rank   : int  1 3 10 20 30 40 50 260 270 280 ...
 $ College: Factor w/ 3 levels "IITB","IITD",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Branch : Factor w/ 3 levels "CSE","EEE","MECH": 1 1 1 1 1 1 1 2 2 2 ...
ds
   Rank College Branch
1     1    IITB    CSE
2     3    IITB    CSE
3    10    IITB    CSE
4    20    IITB    CSE
5    30    IITB    CSE
6    40    IITB    CSE
7    50    IITB    CSE
8   260    IITB    EEE
9   270    IITB    EEE
10  280    IITB    EEE
11  290    IITB    EEE
12  300    IITB    EEE
13  360    IITB   MECH
14  370    IITB   MECH
15  380    IITB   MECH
16  390    IITB   MECH
17  400    IITB   MECH
18   60    IITM    CSE
19   70    IITM    CSE
20   80    IITM    CSE
21   90    IITM    CSE
22  100    IITM    CSE
23  210    IITM   MECH
24  220    IITM   MECH
25  230    IITM   MECH
26  240    IITM   MECH
27  250    IITM   MECH
28  310    IITM    EEE
29  320    IITM    EEE
30  330    IITM    EEE
31  340    IITM    EEE
32  350    IITM    EEE
33  110    IITD    CSE
34  120    IITD    CSE
35  130    IITD    CSE
36  140    IITD    CSE
37  150    IITD    CSE
38  160    IITD   MECH
39  170    IITD   MECH
40  180    IITD   MECH
41  190    IITD   MECH
42  200    IITD   MECH
43  410    IITD    EEE
44  420    IITD    EEE
45  430    IITD    EEE
46  440    IITD    EEE
47  450    IITD    EEE
library(randomForest)
nrow(ds)
# Train on 75% of the rows (rounded down); the rest is held out for testing.
smp_size <- floor(0.75 * nrow(ds))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(ds)), size = smp_size)
train_ds <- ds[train_ind, ]
test_ds <- ds[-train_ind, ]

# Fit a 100-tree forest predicting College from every other column
# (Rank and Branch). proximity = TRUE additionally asks for the proximity
# measure among the training rows.
colleg_rf <- randomForest(
  College ~ .,
  data = train_ds,
  ntree = 100,
  proximity = TRUE
)
colleg_rf
Call:
 randomForest(formula = College ~ ., data = train_ds, ntree = 100,      proximity = TRUE) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 1
        OOB estimate of  error rate: 20%
Confusion matrix:
     IITB IITD IITM class.error
IITB   13    0    0   0.0000000
IITD    3    7    1   0.3636364
IITM    2    1    8   0.2727273
table(predict(colleg_rf),train_ds$College)
      
       IITB IITD IITM
  IITB   13    3    2
  IITD    0    7    1
  IITM    0    1    8
varImpPlot(colleg_rf)
collegePred<-predict(colleg_rf,newdata=test_ds)
nrow(test_ds)
[1] 12
table(collegePred, test_ds$College)
           
collegePred IITB IITD IITM
       IITB    4    0    3
       IITD    0    3    0
       IITM    0    1    1
head(test_ds)
   Rank College Branch
5    30    IITB    CSE
7    50    IITB    CSE
9   270    IITB    EEE
13  360    IITB   MECH
26  240    IITM   MECH
30  330    IITM    EEE
testinput<-data.frame("Rank"=100,"Branch"="CSE")
testinput
  Rank Branch
1  100    CSE
str(testinput)
'data.frame':   1 obs. of  2 variables:
 $ Rank  : num 100
 $ Branch: Factor w/ 1 level "CSE": 1
# Read the rows to score. Ask for factors explicitly: since R 4.0.0
# read.csv() returns character columns by default, whereas the training
# data holds Branch as a factor.
testin <- read.csv("TestIn.csv", stringsAsFactors = TRUE)
str(testin)
'data.frame':   2 obs. of  2 variables:
 $ Rank  : int  100 310
 $ Branch: Factor w/ 1 level "CSE": 1 1
nrow(testin)
[1] 2
# Give Branch the same level set as the training data so predict() accepts it.
# Rebuild the factor by label instead of assigning levels(): `levels<-`
# renames the existing levels by position, so a file containing only "MECH"
# would be silently relabelled as "CSE". factor() matches on the values
# themselves and also works when Branch was read as a character column.
testin$Branch <- factor(testin$Branch, levels = levels(train_ds$Branch))
str(testin)
'data.frame':   2 obs. of  2 variables:
 $ Rank  : int  100 310
 $ Branch: Factor w/ 3 levels "CSE","EEE","MECH": 1 1
testin
  Rank Branch
1  100    CSE
2  310    CSE
predict(colleg_rf,testin,type="response")
   1    2 
IITM IITD 
Levels: IITB IITD IITM
Comments
Post a Comment