8.2.5 Sort Data
The sort_values
function enables flexible sorting of an oml.DataFrame
along one or more columns specified by the by argument, and returns an oml.DataFrame
.
Example 8-12 Sorting Data
The following example demonstrates these operations.
import oml
import pandas as pd
from sklearn import datasets
# Load the iris data set and create a pandas.DataFrame for it.
iris = datasets.load_iris()
x = pd.DataFrame(iris.data,
columns = ['Sepal_Length','Sepal_Width',
'Petal_Length','Petal_Width'])
y = pd.DataFrame(list(map(lambda x:
{0: 'setosa', 1: 'versicolor',
2:'virginica'}[x], iris.target)),
columns = ['Species'])
# Create the IRIS database table and the proxy object for the table.
oml_iris = oml.create(pd.concat([x, y], axis=1), table = 'IRIS')
# Modify the data set by replacing a few entries with NaNs to test
# how the na_position parameter works in the sort_values method.
Iris = oml_iris.pull()
Iris['Sepal_Width'].replace({3.5: None}, inplace=True)
Iris['Petal_Length'].replace({1.5: None}, inplace=True)
Iris['Petal_Width'].replace({2.3: None}, inplace=True)
# Create another table using the changed data.
oml_iris2 = oml.create(Iris, table = 'IRIS2')
# Sort the data set first by Sepal_Length then by Sepal_Width
# in descending order and display the first 5 rows of the
# sorted result.
oml_iris2.sort_values(by = ['Sepal_Length', 'Sepal_Width'],
ascending=False).head()
# Display the last 5 rows of the data set.
oml_iris2.tail()
# Sort the last 5 rows of the iris data set first by Petal_Length
# then by Petal_Width. By default, rows with NaNs are placed
# after the other rows when the sort keys are the same.
oml_iris2.tail().sort_values(by = ['Petal_Length', 'Petal_Width'])
# Sort the last 5 rows of the iris data set first by Petal_Length
# and then by Petal_Width. When the values in these two columns
# are the same, place the row with a NaN before the other row.
oml_iris2.tail().sort_values(by = ['Petal_Length', 'Petal_Width'],
na_position = 'first')
oml.drop('IRIS')
oml.drop('IRIS2')
Listing for This Example
>>> import oml
>>> import pandas as pd
>>> from sklearn import datasets
>>>
>>> # Load the iris data set and create a pandas.DataFrame for it.
... iris = datasets.load_iris()
>>> x = pd.DataFrame(iris.data,
... columns = ['Sepal_Length','Sepal_Width',
... 'Petal_Length','Petal_Width'])
>>> y = pd.DataFrame(list(map(lambda x:
... {0: 'setosa', 1: 'versicolor',
... 2:'virginica'}[x], iris.target)),
... columns = ['Species'])
>>>
>>> # Create the IRIS database table and the proxy object for the table.
... oml_iris = oml.create(pd.concat([x, y], axis=1), table = 'IRIS')
>>>
>>> # Modify the data set by replacing a few entries with NaNs to test
... # how the na_position parameter works in the sort_values method.
... Iris = oml_iris.pull()
>>> Iris['Sepal_Width'].replace({3.5: None}, inplace=True)
>>> Iris['Petal_Length'].replace({1.5: None}, inplace=True)
>>> Iris['Petal_Width'].replace({2.3: None}, inplace=True)
>>>
>>> # Create another table using the changed data.
... oml_iris2 = oml.create(Iris, table = 'IRIS2')
>>>
>>> # Sort the data set first by 'Sepal_Length' then by 'Sepal_Width'
... # in descending order and displays the first 5 rows of the
... # sorted result.
... oml_iris2.sort_values(by = ['Sepal_Length', 'Sepal_Width'],
... ascending=False).head()
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
0 7.9 3.8 6.4 2.0 virginica
1 7.7 3.8 6.7 2.2 virginica
2 7.7 3.0 6.1 NaN virginica
3 7.7 2.8 6.7 2.0 virginica
4 7.7 2.6 6.9 NaN virginica
>>>
>>> # Display the last 5 rows of the data set.
... oml_iris2.tail()
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
0 6.7 3.0 5.2 NaN virginica
1 6.3 2.5 5.0 1.9 virginica
2 6.5 3.0 5.2 2.0 virginica
3 6.2 3.4 5.4 NaN virginica
4 5.9 3.0 5.1 1.8 virginica
>>>
>>> # Sort the last 5 rows of the iris data set first by 'Petal_Length'
... # then by 'Petal_Width'. By default, rows with NaNs are placed
... # after the other rows when the sort keys are the same.
... oml_iris2.tail().sort_values(by = ['Petal_Length', 'Petal_Width'])
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
0 6.3 2.5 5.0 1.9 virginica
1 5.9 3.0 5.1 1.8 virginica
2 6.5 3.0 5.2 2.0 virginica
3 6.7 3.0 5.2 NaN virginica
4 6.2 3.4 5.4 NaN virginica
>>>
>>> # Sort the last 5 rows of the iris data set first by 'Petal_Length'
... # and then by 'Petal_Width'. When the values in these two columns
... # are the same, place the row with a NaN before the other row.
... oml_iris2.tail().sort_values(by = ['Petal_Length', 'Petal_Width'],
... na_position = 'first')
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
0 6.3 2.5 5.0 1.9 virginica
1 5.9 3.0 5.1 1.8 virginica
2 6.7 3.0 5.2 NaN virginica
3 6.5 3.0 5.2 2.0 virginica
4 6.2 3.4 5.4 NaN virginica
>>>
>>> oml.drop('IRIS')
>>> oml.drop('IRIS2')
Parent topic: Explore Data