These scripts are associated with the following publication: “Classification and Ranking of Fermi LAT Gamma-ray Sources from the 3FGL Catalog Using Machine Learning Techniques”, Saz Parkinson, P. M. (HKU/LSR, SCIPP), Xu, H. (HKU), Yu, P. L. H. (HKU), Salvetti, D. (INAF-Milan), Marelli, M. (INAF-Milan), and Falcone, A. D. (Penn State), The Astrophysical Journal, 2016, in press (http://arxiv.org/abs/1602.00385)

NB: You are welcome to use these scripts for your own purposes, but if you do so, we kindly ask that you cite the above publication.

AGN vs PSR classification using Random Forests (RF)

Load workspace from previous step

load(".RData")

Load randomForest and pROC packages

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
set.seed(1)

First use 10-fold cross validation method to fit models and get the forecast of each block of data

k <- 10
for (i in 1:k) {
  Model <- randomForest(pulsarness~., data = FGL3_train[-Block_index[i, ], ], importance = TRUE)
  predictions_FGL3_train_CV[((i-1)*(nrow(FGL3_train_CV)/k) + 1):((i)*(nrow(FGL3_train_CV)/k))] <-
    predict(Model, newdata = FGL3_train[Block_index[i, ], ], type = "Prob")[, 2]
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

Get best threshold

Best_threshold_train_CV <- ROC_threshold_plots_tables(FGL3_train_CV$pulsarness, predictions_FGL3_train_CV)

Now Modeling using all the data

rf.full.FGL3 <- randomForest(pulsarness~., data = FGL3_train, importance = TRUE)
# The importance of each variable
importance(rf.full.FGL3)
##                    Non-Pulsar    Pulsar MeanDecreaseAccuracy
## Spectral_Index       25.56036 17.632979             29.46206
## Variability_Index    31.64328 18.641224             33.66110
## Flux_Density         13.86492  7.538865             14.86319
## Unc_Energy_Flux100   32.74000 23.883328             36.91021
## Signif_Curve         33.10258 29.794806             40.35179
## hr12                 10.67109  5.647244             12.24037
## hr23                 19.39184 10.596104             21.76137
## hr34                  9.85729  9.927413             12.63419
## hr45                 19.86705 21.374217             24.13753
##                    MeanDecreaseGini
## Spectral_Index            41.850994
## Variability_Index         20.837394
## Flux_Density               9.430220
## Unc_Energy_Flux100        28.467787
## Signif_Curve              63.025025
## hr12                       9.666265
## hr23                      13.250555
## hr34                       6.010382
## hr45                      19.210694
varImpPlot(rf.full.FGL3)

Prediction for train and test

predictions_FGL3_train <- predict(rf.full.FGL3, newdata = FGL3_train, type = "Prob")[, 2]
predictions_FGL3_test <- predict(rf.full.FGL3, newdata = FGL3_test, type = "Prob")[,2 ]

Generate ROC plots, and print contingency tables

ROC_threshold_plots_tables(FGL3_train$pulsarness, predictions_FGL3_train, FGL3_test$pulsarness, predictions_FGL3_test, 
                           threshold = Best_threshold_train_CV[1])

##                    real_category
## Predict_class_train Non-Pulsar Pulsar
##              AGN          1215      0
##              Pulsar          2    116
##                   real_category
## Predict_class_test Non-Pulsar Pulsar
##             AGN           504      2
##             Pulsar         17     48
## [1] 0.172
cat("Best threshold from cross-validation:", Best_threshold_train_CV)
## Best threshold from cross-validation: 0.172 0.9695222 0.9310345

Add RF Prediction probabilities to FGL3_results

FGL3_results$RF_P <- predict(rf.full.FGL3, newdata = FGL3_tidy, type = "Prob")[,2 ]

Add RF Prediction category to FGL3_results

FGL3_results$RF_Pred <- ifelse(FGL3_results$RF_P > Best_threshold_train_CV[1], 
                                     c("PSR"), c("AGN")) 

Plot for the paper

par(mfrow = c(1,1))
varImpPlot(rf.full.FGL3, pch = 19, type = 1, main = "")

Clean up and Set aside required data sets

Environ <- ls()
Environ <- Environ[Environ != "FGL3_tidy"
                   & Environ != "FGL3_results"
                   & Environ != "FGL3_test" 
                   & Environ != "FGL3_train" 
                   & Environ != "predictions_FGL3_train_CV" 
                   & Environ != "Block_index" 
                   & Environ != "FGL3_train_CV"
                   & Environ != "ROC_threshold_plots_tables"
                   & Environ != "FGL3_Pulsars_train"
                   & Environ != "FGL3_Pulsars_test"
                   & Environ != "rf.full.FGL3"]
rm(list = Environ)

Save current workspace for subsequent steps

save.image()