These scripts are associated with the following publication: “Classification and Ranking of Fermi LAT Gamma-ray Sources from the 3FGL Catalog Using Machine Learning Techniques”, Saz Parkinson, P. M. (HKU/LSR, SCIPP), Xu, H. (HKU), Yu, P. L. H. (HKU), Salvetti, D. (INAF-Milan), Marelli, M. (INAF-Milan), and Falcone, A. D. (Penn State), The Astrophysical Journal, 2016, in press (http://arxiv.org/abs/1602.00385)

NB: You are welcome to use these scripts for your own purposes, but if you do so, we kindly ask that you cite the above publication.

Pulsar (YNG vs MSP) classification using Random Forests (RF)

Load workspace from previous step:

load(".RData")

Load randomForest and pROC packages:

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Set random seed:

set.seed(1)

Full model:

rf.P <- randomForest(pulsarness ~., data = FGL3_Pulsars_train, importance = TRUE)
print(rf.P)
## 
## Call:
##  randomForest(formula = pulsarness ~ ., data = FGL3_Pulsars_train,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 10.1%
## Confusion matrix:
##     MSP YNG class.error
## MSP  43   4  0.08510638
## YNG   6  46  0.11538462

The importance of each variable:

importance(rf.P)
##                           MSP        YNG MeanDecreaseAccuracy
## GLAT               11.8362611 15.4978565           15.4138510
## Spectral_Index      3.3411886  0.9654453            3.0658757
## Variability_Index  -1.0281422 -3.4615356           -3.1882980
## Conf_95_PosAng      0.3733218 -1.4002385           -0.5706251
## Flux_Density       10.5865542  4.4787057           10.1160192
## Unc_Energy_Flux100 26.6749033 25.3375128           30.3358989
## Signif_Curve        8.1572730  4.7320722            8.7531204
## hr12               -3.1783394 -0.4783802           -2.7191745
## hr23                5.7989452  4.5546635            6.6805685
## hr34                1.9491817  3.7728334            3.6962043
## hr45               -1.0843239  4.3592551            2.1444392
##                    MeanDecreaseGini
## GLAT                       8.600107
## Spectral_Index             2.055341
## Variability_Index          1.415482
## Conf_95_PosAng             1.481118
## Flux_Density               3.950428
## Unc_Energy_Flux100        18.073279
## Signif_Curve               3.764994
## hr12                       1.242599
## hr23                       3.412330
## hr34                       2.615104
## hr45                       2.240045
varImpPlot(rf.P)

Model predictions:

predictions_FGL3_Pulsars_train <- predict(rf.P,
                                          newdata = FGL3_Pulsars_train, type = "Prob")[,2 ]
predictions_FGL3_Pulsars_test <- predict(rf.P,
                                         newdata = FGL3_Pulsars_test, type = "Prob")[,2 ]

Generate ROC plots and print contingency tables:

Best_threshold_train <- ROC_threshold_plots_tables(FGL3_Pulsars_train$pulsarness,
                                                   predictions_FGL3_Pulsars_train,
                                                   FGL3_Pulsars_test$pulsarness,
                                                   predictions_FGL3_Pulsars_test,
                                                   cat1 = "YNG", cat2 = "MSP")

##                    real_category
## Predict_class_train MSP YNG
##                 MSP  47   0
##                 YNG   0  52
##                   real_category
## Predict_class_test MSP YNG
##                MSP  16   2
##                YNG   2  23

Add RF Prediction probabilities to FGL3_results data frame:

FGL3_results$RF_PSR_P <- round(predict(rf.P,
                                       newdata = FGL3_tidy, type = "Prob")[,2 ],
                               digits = 3)

Add RF Prediction category to FGL3_results data frame:

FGL3_results$RF_PSR_Pred <- ifelse(FGL3_results$RF_PSR_P > Best_threshold_train[1],
                                   c("YNG"), c("MSP"))

Remove “pulsar-type” category prediction for those sources predicted to be AGN:

FGL3_results$RF_PSR_Pred[(FGL3_results$LR_Pred=="AGN")&(FGL3_results$RF_Pred=="AGN")]=""

Plot Importance:

par(mfrow = c(1,1))
varImpPlot(rf.P, pch = 19, type = 1, main = "")

Clean up and Set aside required data sets

Environ <- ls()
Environ <- Environ[Environ != "FGL3_tidy"
                   & Environ != "FGL3_results"
                   & Environ != "FGL3_test" 
                   & Environ != "FGL3_train" 
                   & Environ != "predictions_FGL3_train_CV" 
                   & Environ != "Block_index" 
                   & Environ != "FGL3_train_CV"
                   & Environ != "ROC_threshold_plots_tables"
                   & Environ != "FGL3_Pulsars_train"
                   & Environ != "FGL3_Pulsars_test"]
rm(list = Environ)

Save current workspace for subsequent steps

save.image()