8.2.4 Mutate Data
When preparing data for analysis, a common step is to mutate the data: either reformat existing values, or derive new columns and add them to the data set.
These examples demonstrate methods of formatting data and deriving columns.
import pandas as pd
import oml
# Build the shopping cart locally as a pandas DataFrame: one row per item,
# with its name, category, quantity bought and price per unit.
shopping_cart = pd.DataFrame({
    'Item_name': [
        'paper_towel', 'ground_pork', 'tofu', 'eggs',
        'pork_loin', 'whole_milk', 'egg_custard',
    ],
    'Item_type': [
        'grocery', 'meat', 'grocery', 'dairy',
        'meat', 'dairy', 'bakery',
    ],
    'Quantity': [1, 2.6, 4, 1, 1.9, 1, 1],
    'Unit_price': [1.19, 2.79, 0.99, 2.49, 3.19, 2.5, 3.99],
})

# Hand the DataFrame to oml.push and work with the object it returns.
oml_cart = oml.push(shopping_cart)
oml_cart

# Derive the total per row (Quantity times Unit_price), then attach it to
# the cart as a new 'Price' column rounded to 2 decimal places. This is the
# only step that reassigns oml_cart, so 'Price' is available to every step
# that follows.
price = oml_cart['Quantity'] * oml_cart['Unit_price']
type(price)
price
oml_cart = oml_cart.concat({'Price': price.round(2)})

# Count the occurrences of the pattern 'egg' in each item name.
# The concat result below is only displayed; oml_cart is not reassigned,
# so later steps do not carry the 'Egg_pattern' column.
egg_pattern = oml_cart['Item_name'].count_pattern('egg')
type(egg_pattern)
oml_cart.concat({'Egg_pattern': egg_pattern})

# Locate where the substring 'pork' starts in each item name.
# Positions are zero-based, and -1 marks names without the substring.
pork_startInd = oml_cart['Item_name'].find('pork')
type(pork_startInd)
oml_cart.concat({'Pork_startInd': pork_startInd})

# Flag the rows whose category is exactly 'grocery'.
is_grocery = oml_cart['Item_type'] == 'grocery'
type(is_grocery)
oml_cart.concat({'Is_grocery': is_grocery})

# Measure the number of characters in each item name.
name_length = oml_cart['Item_name'].len()
type(name_length)
oml_cart.concat({'Name_length': name_length})

# Apply element-wise numeric functions to the 'Price' column:
# ceiling, floor, exponential, logarithm and square root.
oml_cart['Price'].ceil()
oml_cart['Price'].floor()
oml_cart['Price'].exp()
oml_cart['Price'].log()
oml_cart['Price'].sqrt()
Listing for This Example
>>> import pandas as pd
>>> import oml
>>>
>>> # Create a shopping cart data set.
... shopping_cart = pd.DataFrame({
... 'Item_name': ['paper_towel', 'ground_pork', 'tofu', 'eggs',
... 'pork_loin', 'whole_milk', 'egg_custard'],
... 'Item_type': ['grocery', 'meat', 'grocery', 'dairy', 'meat',
... 'dairy', 'bakery'],
... 'Quantity': [1, 2.6, 4, 1, 1.9, 1, 1],
... 'Unit_price': [1.19, 2.79, 0.99, 2.49, 3.19, 2.5, 3.99]
... })
>>> oml_cart = oml.push(shopping_cart)
>>> oml_cart
Item_name Item_type Quantity Unit_price
0 paper_towel grocery 1.0 1.19
1 ground_pork meat 2.6 2.79
2 tofu grocery 4.0 0.99
3 eggs dairy 1.0 2.49
4 pork_loin meat 1.9 3.19
5 whole_milk dairy 1.0 2.50
6 egg_custard bakery 1.0 3.99
>>>
>>> # Add a column 'Price' multiplying 'Quantity' with 'Unit_price',
... # rounded to 2 decimal places.
... price = oml_cart['Quantity']*(oml_cart['Unit_price'])
>>> type(price)
<class 'oml.core.float.Float'>
>>> price
[1.19, 7.254, 3.96, 2.49, 6.061, 2.5, 3.99]
>>> oml_cart = oml_cart.concat({'Price': price.round(2)})
>>>
>>> # Count the pattern 'egg' in the 'Item_name' column.
... egg_pattern = oml_cart['Item_name'].count_pattern('egg')
>>> type(egg_pattern)
<class 'oml.core.float.Float'>
>>> oml_cart.concat({'Egg_pattern': egg_pattern})
Item_name Item_type Quantity Unit_price Price Egg_pattern
0 paper_towel grocery 1.0 1.19 1.19 0
1 ground_pork meat 2.6 2.79 7.25 0
2 tofu grocery 4.0 0.99 3.96 0
3 eggs dairy 1.0 2.49 2.49 1
4 pork_loin meat 1.9 3.19 6.06 0
5 whole_milk dairy 1.0 2.50 2.50 0
6 egg_custard bakery 1.0 3.99 3.99 1
>>>
>>> # Find the start index of substring 'pork' in the 'Item_name' column.
... pork_startInd = oml_cart['Item_name'].find('pork')
>>> type(pork_startInd)
<class 'oml.core.float.Float'>
>>> oml_cart.concat({'Pork_startInd': pork_startInd})
Item_name Item_type Quantity Unit_price Price Pork_startInd
0 paper_towel grocery 1.0 1.19 1.19 -1
1 ground_pork meat 2.6 2.79 7.25 7
2 tofu grocery 4.0 0.99 3.96 -1
3 eggs dairy 1.0 2.49 2.49 -1
4 pork_loin meat 1.9 3.19 6.06 0
5 whole_milk dairy 1.0 2.50 2.50 -1
6 egg_custard bakery 1.0 3.99 3.99 -1
>>>
>>> # Check whether items are of grocery category.
... is_grocery=oml_cart['Item_type']=='grocery'
>>> type(is_grocery)
<class 'oml.core.boolean.Boolean'>
>>> oml_cart.concat({'Is_grocery': is_grocery})
Item_name Item_type Quantity Unit_price Price Is_grocery
0 paper_towel grocery 1.0 1.19 1.19 True
1 ground_pork meat 2.6 2.79 7.25 False
2 tofu grocery 4.0 0.99 3.96 True
3 eggs dairy 1.0 2.49 2.49 False
4 pork_loin meat 1.9 3.19 6.06 False
5 whole_milk dairy 1.0 2.50 2.50 False
6 egg_custard bakery 1.0 3.99 3.99 False
>>>
>>> # Calculate the length of item names.
... name_length=oml_cart['Item_name'].len()
>>> type(name_length)
<class 'oml.core.float.Float'>
>>> oml_cart.concat({'Name_length': name_length})
Item_name Item_type Quantity Unit_price Price Name_length
0 paper_towel grocery 1.0 1.19 1.19 11
1 ground_pork meat 2.6 2.79 7.25 11
2 tofu grocery 4.0 0.99 3.96 4
3 eggs dairy 1.0 2.49 2.49 4
4 pork_loin meat 1.9 3.19 6.06 9
5 whole_milk dairy 1.0 2.50 2.50 10
6 egg_custard bakery 1.0 3.99 3.99 11
>>>
>>> # Get the ceiling, floor, exponential, logarithm and square root
... # of the 'Price' column.
... oml_cart['Price'].ceil()
[2, 8, 4, 3, 7, 3, 4]
>>> oml_cart['Price'].floor()
[1, 7, 3, 2, 6, 2, 3]
>>> oml_cart['Price'].exp()
[3.2870812073831184, 1408.1048482046956, 52.45732594909905, 12.061276120444719, 428.37543685928694, 12.182493960703473, 54.05488936332659]
>>> oml_cart['Price'].log()
[0.173953307123438, 1.9810014688665833, 1.3762440252663892, 0.9122827104766162, 1.801709800081223, 0.9162907318741551, 1.3837912309017721]
>>> oml_cart['Price'].sqrt()
[1.0908712114635715, 2.692582403567252, 1.98997487421324, 1.57797338380595, 2.4617067250182343, 1.5811388300841898, 1.997498435543818]
Parent topic: Explore Data