4.1.6 Transform Data
In preparing data for analysis, a typical step is to transform data by reformatting it or deriving new columns and adding them to the data set.
The examples in this topic demonstrate two ways of formatting data and deriving columns.
Example 4-12 Formatting Data
This example creates a function to format the data in a column.
# Create a function for formatting data. petalCategory_fmt <- function(x) { ifelse(x > 5, 'LONG', ifelse(x > 2, 'MEDIUM', 'SMALL')) } # Create an ore.frame in database memory with the iris data set. iris_of <- ore.push(iris) # Select some rows from iris_of. iris_of[c(10, 20, 60, 80, 110, 140),] # Format the data in Petal.Length column. iris_of$Petal.Length <- petalCategory_fmt(iris_of$Petal.Length) # Select the same rows from iris_of.
Listing for This Example
R> # Create a function for formatting data. R> petalCategory_fmt <- function(x) { + ifelse(x > 5, 'LONG', + ifelse(x > 2, 'MEDIUM', 'SMALL')) + } R> # Create an ore.frame in database memory with the iris data set. R> iris_of <- ore.push(iris) R> # Select some rows from iris_of. R> iris_of[c(10, 20, 60, 80, 110, 140),] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 10 4.9 3.1 1.5 0.1 setosa 20 5.1 3.8 1.5 0.3 setosa 60 5.2 2.7 3.9 1.4 versicolor 80 5.7 2.6 3.5 1.0 versicolor 110 7.2 3.6 6.1 2.5 virginica 140 6.9 3.1 5.4 2.1 virginica R> # Format the data in Petal.Length column. R> iris_of$Petal.Length <- petalCategory_fmt(iris_of$Petal.Length) R> # Select the same rows from iris_of. R> iris_of[c(10, 20, 60, 80, 110, 140),] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 10 4.9 3.1 SMALL 0.1 setosa 20 5.1 3.8 SMALL 0.3 setosa 60 5.2 2.7 MEDIUM 1.4 versicolor 80 5.7 2.6 MEDIUM 1.0 versicolor 110 7.2 3.6 LONG 2.5 virginica 140 6.9 3.1 LONG 2.1 virginica
Example 4-13 Using the transform Function
This example does the same thing as the previous example except that it uses the transform
function to reformat the data in a column of the data set.
# Create an ore.frame in database memory with the iris data set. iris_of2 <- ore.push(iris) # Select some rows from iris_of. iris_of2[c(10, 20, 60, 80, 110, 140),] iris_of2 <- transform(iris_of2, Petal.Length = ifelse(Petal.Length > 5, 'LONG', ifelse(Petal.Length > 2, 'MEDIUM', 'SMALL'))) iris_of2[c(10, 20, 60, 80, 110, 140),]
Listing for This Example
R> # Create an ore.frame in database memory with the iris data set. R> iris_of2 <- ore.push(iris) R> # Select some rows from iris_of. R> iris_of2[c(10, 20, 60, 80, 110, 140),] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 10 4.9 3.1 1.5 0.1 setosa 20 5.1 3.8 1.5 0.3 setosa 60 5.2 2.7 3.9 1.4 versicolor 80 5.7 2.6 3.5 1.0 versicolor 110 7.2 3.6 6.1 2.5 virginica 140 6.9 3.1 5.4 2.1 virginica R> iris_of2 <- transform(iris_of2, + Petal.Length = ifelse(Petal.Length > 5, 'LONG', + ifelse(Petal.Length > 2, 'MEDIUM', 'SMALL'))) R> iris_of2[c(10, 20, 60, 80, 110, 140),] Sepal.Length Sepal.Width Petal.Length Petal.Width Species 10 4.9 3.1 SMALL 0.1 setosa 20 5.1 3.8 SMALL 0.3 setosa 60 5.2 2.7 MEDIUM 1.4 versicolor 80 5.7 2.6 MEDIUM 1.0 versicolor 110 7.2 3.6 LONG 2.5 virginica 140 6.9 3.1 LONG 2.1 virginica
Example 4-14 Adding Derived Columns
This example uses the transform
function to add a derived column to the data set and then to add additional columns to it.
# Set the page width. options(width = 80) # Create an ore.frame in database memory with the iris data set. iris_of <- ore.push(iris) names(iris_of) # Add one column derived from another iris_of <- transform(iris_of, LOG_PL = log(Petal.Length)) names(iris_of) head(iris_of, 3) # Add more columns. iris_of <- transform(iris_of, SEPALBINS = ifelse(Sepal.Length < 6.0, "A", "B"), PRODUCTCOLUMN = Petal.Length * Petal.Width, CONSTANTCOLUMN = 10) names(iris_of) # Select some rows of iris_of. iris_of[c(10, 20, 60, 80, 110, 140),]
Listing for This Example
R> # Set the page width. R> options(width = 80) R> # Create an ore.frame in database memory with the iris data set. R> iris_of <- ore.push(iris) R> names(iris_of) [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" R> # Add one column derived from another R> iris_of <- transform(iris_of, LOG_PL = log(Petal.Length)) R> names(iris_of) [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" [6] "LOG_PL" R> head(iris_of, 3) Sepal.Length Sepal.Width Petal.Length Petal.Width Species LOG_PL 1 5.1 3.5 1.4 0.2 setosa 0.3364722 2 4.9 3.0 1.4 0.2 setosa 0.3364722 3 4.7 3.2 1.3 0.2 setosa 0.2623643 R> # Add more columns. R> iris_of <- transform(iris_of, SEPALBINS = ifelse(Sepal.Length < 6.0, "A", "B"), PRODUCTCOLUMN = Petal.Length * Petal.Width, CONSTANTCOLUMN = 10) R> names(iris_of) [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" [5] "Species" "LOG_PL" "CONSTANTCOLUMN" "SEPALBINS" [9] "PRODUCTCOLUMN" R> # Select some rows of iris_of. R> iris_of[c(10, 20, 60, 80, 110, 140),] Sepal.Length Sepal.Width Petal.Length Petal.Width Species LOG_PL 10 4.9 3.1 1.5 0.1 setosa 0.4054651 20 5.1 3.8 1.5 0.3 setosa 0.4054651 60 5.2 2.7 3.9 1.4 versicolor 1.3609766 80 5.7 2.6 3.5 1.0 versicolor 1.2527630 110 7.2 3.6 6.1 2.5 virginica 1.8082888 140 6.9 3.1 5.4 2.1 virginica 1.6863990 CONSTANTCOLUMN SEPALBINS PRODUCTCOLUMN 10 10 A 0.15 20 10 A 0.45 60 10 A 5.46 80 10 A 3.50 110 10 B 15.25 140 10 B 11.34