Partitioned Model

6.17 Partitioned Model

A partitioned model is an ensemble model that consists of multiple sub-models, one for each partition of the data.

A partitioned model may achieve better accuracy through multiple targeted models that are managed and used as one. A partitioned model can simplify scoring by allowing you to reference the top-level model only. The proper sub-model is chosen by the system based on the values of the partitioned column or columns for each row of data to be scored.

To create a partitioned in-database model, use the odm.setting argument with ODMS_PARTITION_COLUMNS as the name and with the names of the columns by which to partition the input data as the value. The OREdm function returns a model with a sub-model for each partition. The partitions are based on the unique values found in the columns.

The partitions function returns an ore.frame that lists each partition of the specified model object and the associated partition column values of the model. Partition names are system-determined. The function returns NULL for a non-partitioned model.

The performance of a high number of partitions (up to 32K component models) in a partitioned model is improved, and dropping a single model within a partition model is also improved.

Example 6-20 Create a Partitioned Model

This example creates a partitioned Support Vector Machine classification model. It uses the Wine Quality data set from the University of California, Irvine Machine Learning Repository.

# Download the wine data set and create the data table.
white.url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
white.wine <- read.csv(white.url, header = TRUE, sep = ";")
white.wine$color <- "white"

red.url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
red.wine <- read.csv(red.url, header = TRUE, sep = ";")
red.wine$color <- "red"

dat <- rbind(white.wine, red.wine)

# Drop the WINE table if it exists.
ore.drop(table="WINE")
ore.create(dat, table="WINE")

# Assign row names to enable row indexing for train and test samples.
row.names(WINE) <- WINE$color

# Enable reproducible results.
set.seed(seed=6218945)

n.rows        <- nrow(WINE)

# Train and test sampling.
random.sample <- sample(1:n.rows, ceiling(n.rows/2))

# Sample in-database using row indexing.
WINE.train    <- WINE[random.sample,]
WINE.test     <- WINE[setdiff(1:n.rows,random.sample),]

# Build a Support Vector Machine classification model 
# on the training data set, using both red and white wine.
mod.svm   <- ore.odmSVM(quality~.-pH-fixed.acidity, WINE.train, 
                        "classification", kernel.function="linear")

# Predict wine quality on the test data set.
pred.svm  <- predict (mod.svm, WINE.test,"quality")

# View the probability of each class and prediction.
head(pred.svm,3)

# Generate a confusion matrix. Note that 3 and 8 are not predicted.
with(pred.svm, table(quality, PREDICTION, dnn = c("Actual", "Predicted")))

# Build a partitioned SVM model based on wine color.
# Specify the partitioning column with the odm.settings argument.
mod.svm2   <- ore.odmSVM(quality~.-pH-fixed.acidity, WINE.train, 
                         "classification", kernel.function="linear",
                         odm.settings=list(odms_partition_columns = "color"))

# Predict wine quality on the test data set.
pred.svm2  <- predict (mod.svm2, WINE.test, "quality")

# View the probability of each class and prediction.
head(pred.svm2,3)

# Generate a confusion matrix. Note that 3 and 4 are not predicted.
with(pred.svm2, table(quality, PREDICTION, dnn = c("Actual", "Predicted")))

partitions(mod.svm2)
summary(mod.svm2["red"])

Listing for This Example

> # Download the wine data set and create the data table.
> white.url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
> white.wine <- read.csv(white.url, header = TRUE, sep = ";")
> white.wine$color <- "white"
> 
> red.url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
> red.wine <- read.csv(red.url, header = TRUE, sep = ";")
> red.wine$color <- "red"
> 
> dat <- rbind(white.wine, red.wine)
> 
> # Drop the WINE table if it exists.
> ore.drop(table="WINE")
Warning message:
Table WINE does not exist. 
> ore.create(dat, table="WINE")
> 
> # Assign row names to enable row indexing for train and test samples.
> row.names(WINE) <- WINE$color
> 
> # Enable reproducible results.
> set.seed(seed=6218945)                  
>
> n.rows        <- nrow(WINE)
>
> # Train and test sampling.
> random.sample <- sample(1:n.rows, ceiling(n.rows/2))
>
> # Sample in-database using row indexing.
> WINE.train    <- WINE[random.sample,]
> WINE.test     <- WINE[setdiff(1:n.rows,random.sample),]    
> 
> # Build a Support Vector Machine classification model
> # on the training data set, using both red and white wine.
> mod.svm   <- ore.odmSVM(quality~.-pH-fixed.acidity, WINE.train,
+                         "classification",kernel.function="linear")
>
> # Predict wine quality on the test data set.
> pred.svm  <- predict (mod.svm, WINE.test,"quality")
>
> # View the probability of each class and prediction.
> head(pred.svm,3)
            '3'       '4'        '5'       '6'       '7'       '8'       '9'
red   0.04957242 0.1345280 0.27779399 0.1345281 0.1345280 0.1345275 0.1345220
red.1 0.04301663 0.1228311 0.34283345 0.1228313 0.1228311 0.1228307 0.1228257
red.2 0.04473419 0.1713883 0.09832961 0.1713891 0.1713890 0.1713886 0.1713812
      quality PREDICTION
red         4          5
red.1       5          5
red.2       7          6
>
> # Generate a confusion matrix. Note that 3 and 4 are not predicted.
> with(pred.svm, table(quality,PREDICTION, dnn = c("Actual","Predicted")))
      Predicted
Actual   3   4   5   6   7   8   9
     3   0   0  11   5   0   0   0
     4   0   1  85  16   2   0   0
     5   2   1 927 152   4   0   1
     6   2   1 779 555  63   1   9
     7   2   0 121 316  81   0   3
     8   0   0  18  66  21   0   0
     9   0   0   0   2   1   0   0
>
> partitions(mod.svm2)
  PARTITION_NAME color
1            red   red
2          white white
> summary(mod.svm2["red"])
$red

Call:
ore.odmSVM(formula = quality ~ . - pH - fixed.acidity, data = WINE.train, 
    type = "classification", kernel.function = "linear", odm.settings = list(odms_partition_columns = "color"))

Settings: 
                                               value
clas.weights.balanced                            OFF
odms.details                             odms.enable
odms.max.partitions                             1000
odms.missing.value.treatment odms.missing.value.auto
odms.partition.columns                       "color"
odms.sampling                  odms.sampling.disable
prep.auto                                         ON
active.learning                            al.enable
conv.tolerance                                 1e-04
kernel.function                               linear

Coefficients: 
   PARTITION_NAME class             variable value      estimate
1             red     3          (Intercept)       -1.347392e+01
2             red     3              alcohol        7.245737e-01
3             red     3            chlorides        1.761946e+00
4             red     3          citric.acid       -3.276716e+00
5             red     3              density        2.449906e+00
6             red     3  free.sulfur.dioxide       -6.035430e-01
7             red     3       residual.sugar        9.097631e-01
8             red     3            sulphates        1.240524e-04
9             red     3 total.sulfur.dioxide       -2.467554e+00
10            red     3     volatile.acidity        1.300470e+00
11            red     4          (Intercept)       -1.000002e+00
12            red     4              alcohol       -7.920188e-07
13            red     4            chlorides       -2.589198e-08
14            red     4          citric.acid        9.340296e-08
15            red     4              density       -5.418190e-07
16            red     4  free.sulfur.dioxide       -6.981268e-08
17            red     4       residual.sugar        3.389558e-07
18            red     4            sulphates        1.417324e-07
19            red     4 total.sulfur.dioxide       -3.113900e-07
20            red     4     volatile.acidity        4.928625e-07
21            red     5          (Intercept)       -3.151406e-01
22            red     5              alcohol       -9.692192e-01
23            red     5            chlorides        3.690034e-02
24            red     5          citric.acid        2.258823e-01
25            red     5              density       -1.770474e-01
26            red     5  free.sulfur.dioxide       -1.289540e-01
27            red     5       residual.sugar        7.521771e-04
28            red     5            sulphates       -3.596548e-01
29            red     5 total.sulfur.dioxide        5.688280e-01
30            red     5     volatile.acidity        3.005168e-01
31            red     6          (Intercept)       -9.999994e-01
32            red     6              alcohol        8.807703e-07
33            red     6            chlorides        6.871310e-08
34            red     6          citric.acid       -4.525750e-07
35            red     6              density        5.786923e-07
36            red     6  free.sulfur.dioxide        3.856018e-07
37            red     6       residual.sugar       -4.281695e-07
38            red     6            sulphates        1.036468e-07
39            red     6 total.sulfur.dioxide       -4.287512e-07
40            red     6     volatile.acidity       -4.426151e-07
41            red     7          (Intercept)       -1.000000e+00
42            red     7              alcohol        1.761665e-07
43            red     7            chlorides       -3.583316e-08
44            red     7          citric.acid       -4.837739e-08
45            red     7              density        2.169500e-08
46            red     7  free.sulfur.dioxide        4.800717e-08
47            red     7       residual.sugar        1.909498e-08
48            red     7            sulphates        1.062205e-07
49            red     7 total.sulfur.dioxide       -2.339108e-07
50            red     7     volatile.acidity       -1.539326e-07
51            red     8          (Intercept)       -1.000000e+00
52            red     8              alcohol        7.089889e-08
53            red     8            chlorides       -8.566726e-09
54            red     8          citric.acid        2.769301e-08
55            red     8              density       -3.852321e-08
56            red     8  free.sulfur.dioxide       -1.302056e-08
57            red     8       residual.sugar        4.847947e-09
58            red     8            sulphates        1.276461e-08
59            red     8 total.sulfur.dioxide       -5.484427e-08
60            red     8     volatile.acidity        2.959182e-08

Parent topic: OML4R Classes That Provide Access to In-Database Machine Learning Algorithms