It identifies PV and non-PV consumers using a set of linear classifiers, namely Naive Bayes, Support Vector Machine, Generalized Linear Model, Linear Discriminant Analysis and Perceptron.
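Internally the classifiers are fitted through the 'caret' training interface. The sketch below is a minimal illustration of how such a set of models could be trained and compared with caret; the method codes ("nb", "svmLinear", "glm", "lda", "mlp") are assumptions chosen for illustration and may differ from the methods the package actually uses.

library(caret)
# Hypothetical mapping of the five classifiers to caret method codes;
# these codes are assumptions, not the package's documented internals.
methods <- c("nb", "svmLinear", "glm", "lda", "mlp")
ctrl <- trainControl(method = "cv", number = 5)
acc <- sapply(methods, function(m) {
  fit <- train(cls ~ ., data = data.train, method = m, trControl = ctrl)
  # Testing accuracy: share of correctly predicted class labels.
  mean(predict(fit, newdata = data.test) == data.test$cls)
})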

classify.pv.npv(data.train, data.test, trControl)

Arguments

data.train

The training dataset, presented as a data frame. Each row of the data frame contains the features of one consumer, so the number of rows equals the number of consumers. The last column must be the class label (a minimal layout sketch follows these arguments).

data.test

The testing dataset, presented as a data frame. Each row of the data frame contains the features of one consumer, so the number of rows equals the number of consumers. The last column must be the class label.

trControl

A list of values that defines how the classifiers are trained, as generated by trainControl from the package 'caret'. See trainControl and http://topepo.github.io/caret/using-your-own-model-in-train.html.
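As a concrete illustration of the expected inputs, both datasets are plain data frames whose final column holds a two-level factor. The column names and values below are made up for the sketch; any number of feature columns is allowed.

library(caret)
# A made-up two-feature layout; the class label must be the last column.
data.train <- data.frame(feature1 = c(0.8, 1.2, 0.3, 1.5),
                         feature2 = c(2.1, 1.9, 0.4, 0.7),
                         cls = factor(c("False", "False", "True", "True")))
data.test <- data.frame(feature1 = c(1.0, 0.5),
                        feature2 = c(2.0, 0.6),
                        cls = factor(c("False", "True")))
# trControl is typically produced by caret::trainControl, e.g. 5-fold CV:
trControl <- trainControl(method = "cv", number = 5)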

Value

model.name --- A character vector giving the names of the linear classifiers used in this function.

results --- A numeric vector of testing accuracies obtained from the corresponding classifiers listed in 'model.name'.
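Since 'results' is aligned with 'model.name', the two components can be combined into a named vector for quick inspection. The snippet below assumes a returned object called classify.results, as in the example at the end of this page.

res <- classify.pv.npv(data.train, data.test, trControl)
setNames(res$results, res$model.name)    # accuracy labelled by classifier
res$model.name[which.max(res$results)]   # best-performing classifier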

Examples

### Load the required packages.
rm(list=ls())
library(caret)
library(matrixStats)
library(GGally)
#> Registered S3 method overwritten by 'GGally':
#>   method from
#>   +.gg   ggplot2
### Load the smart meter readings and the simulated PV generations.
data("loadsample")
data("pvSample")

#######################################################
## Get the load profiles of both PV and non-PV consumers.
N.house<-ncol(loadsample[,-c(1:6)]) # N.house is the total number of households.
NPV.day<-lapply(loadsample[,-c(1:6)], load.daily, load.date=loadsample[,1], num.obs=48)

# Assume that 30% of the households use PV, and generate the PV samples.
# Note that a real experiment should include hundreds or thousands of simulation
# replications; only one replication is shown below for illustration.
perc<-0.3
pv.house<-floor(N.house*perc)
M<-ncol(pvSample[,-c(1:6)])
pv.rand<-sample(1:M, pv.house, replace = TRUE, prob = rep(1/M, times=M))
pvSample<-cbind(pvSample[ ,c(1:6)], pvSample[ ,-c(1:6)][ ,pv.rand])
date.PV.NPV<-unique(pvSample$date)
PV.day<-lapply(pvSample[,-c(1:6)], load.daily, load.date=pvSample$date, num.obs=48)
house_pv<-sample(N.house)[1:pv.house] # Randomly select the PV households.

# Compute the real load demand of the PV consumers using the smart meter readings
# (loadsample) and the simulated PV samples (pvSample).
demand_pv<-vector(mode = "list", length = length(PV.day))
for (k in 1:pv.house){
  demand_pv[[k]]<-NPV.day[[house_pv[k]]]-PV.day[[k]]
}
demand_npv<-NPV.day[-house_pv]

# Label the PV consumers with 0 and the non-PV consumers with 1.
label.PV<-c(rep(0, times=length(house_pv)), rep(1, times=length(demand_npv)))
label.PV<-factor(label.PV)

############ Prepare the features used to identify PV and non-PV consumers.
date.week<-weekdays(as.Date(date.PV.NPV))
x.week<-date.week
mor.be<-13
aft.be<-39
feature.PV<-lapply(demand_pv, features.load, x.week, mor.be, aft.be)
feature.NPV<-lapply(demand_npv, features.load, x.week, mor.be, aft.be)
features.name<-feature.PV[[1]]$features.name # Get the names of the computed features.

# Prepare the feature matrices.
feature.PV.new<-feature.PV[[1]]$output.features
for (i in 2:length(feature.PV)){
  feature.PV.new<-rbind(feature.PV.new, feature.PV[[i]]$output.features)
}
feature.NPV.new<-feature.NPV[[1]]$output.features
for (i in 2:length(feature.NPV)){
  feature.NPV.new<-rbind(feature.NPV.new, feature.NPV[[i]]$output.features)
}

##### Compute the significance of each feature and select the 12 most significant features.
feature.PV.NPV<-rbind(feature.PV.new, feature.NPV.new)
p.value<-apply(feature.PV.NPV, 2, features.importance, label.PV)
p.value<-sort(p.value, decreasing = FALSE, index.return=TRUE)
features.order<-features.name[p.value$ix]
PV_feature<-feature.PV.new[,p.value$ix[1:12]]
NPV_feature<-feature.NPV.new[ ,p.value$ix[1:12]]

##### Visualize the differences between the features of PV and non-PV consumers.
##### The top 5 features are shown as an example.
PV.NPV<-data.frame(rbind(PV_feature[,1:5], NPV_feature[,1:5]))
PV.NPV$cls<-label.PV
PV.NPV<-PV.NPV[sample(nrow(PV.NPV)), ]
PV.NPV$cls<-factor(PV.NPV$cls, levels = c(0,1), labels = c("PV", "non-PV"))
options(repr.P.width=14, repr.P.height=10)
ggpairs(PV.NPV, title="The 5 most significant features", mapping=ggplot2::aes(colour = cls))
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
######### Use the above selected features to classify PV and non-PV consumers.
#### Prepare the training and testing datasets.
set.seed(123)
# 70% of the data (PV and non-PV) are used for training, and the remainder for testing.
train.data<-0.7

# Prepare the training and testing datasets of the PV consumers.
n.pv<-sample(nrow(PV_feature))
data.train.pv<-as.matrix(PV_feature[n.pv[1:floor(train.data*length(n.pv))], ])
data.test.pv<-as.matrix(PV_feature[n.pv[-c(1:floor(train.data*length(n.pv)))], ])
pv.cls.test<-rep(0, times=nrow(data.test.pv))
pv.cls.train<-rep(0, times=nrow(data.train.pv))

# Prepare the training and testing datasets of the non-PV consumers.
n.npv<-sample(nrow(NPV_feature))
data.train.npv<-as.matrix(NPV_feature[n.npv[1:floor(train.data*length(n.npv))], ])
data.test.npv<-as.matrix(NPV_feature[n.npv[-c(1:floor(train.data*length(n.npv)))], ])
npv.cls.test<-rep(1, times=nrow(data.test.npv))
npv.cls.train<-rep(1, times=nrow(data.train.npv))

### Assemble the training dataset.
data.train<-as.data.frame(rbind(data.train.pv, data.train.npv))
data.train$cls<-c(pv.cls.train, npv.cls.train)
data.train$cls<-factor(data.train$cls, levels = c(0,1), labels = c("False", "True"))

### Assemble the testing dataset.
data.test<-as.data.frame(rbind(data.test.pv, data.test.npv))
data.test$cls<-c(pv.cls.test, npv.cls.test)
data.test$cls<-factor(data.test$cls, levels = c(0,1), labels = c("False", "True"))

# 5-fold cross-validation is used for model selection.
classify.results<-classify.pv.npv(data.train, data.test,
                                  trControl=trainControl(method='cv', number=5))

# Print the testing results.
classify.results
#> $model.name
#> [1] "Naive Bayesian"               "Support Vector Machine"
#> [3] "Generalized Linear Model"     "Linear Discriminate Analysis"
#> [5] "Perceptron"
#>
#> $results
#> [1] 1.0000000 1.0000000 1.0000000 0.9893048 1.0000000
#>
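As the example notes, a real study would repeat the PV assignment, feature extraction and train/test split over many replications and average the accuracies. A minimal sketch of that outer loop, assuming the steps above are wrapped in a hypothetical helper one.replication() that returns the 'results' vector:

# one.replication() is a hypothetical wrapper around the steps above
# (PV sampling, feature extraction, splitting, classify.pv.npv).
n.rep <- 100
acc <- replicate(n.rep, one.replication()) # 5 x n.rep matrix of accuracies
rowMeans(acc)                              # mean testing accuracy per classifier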