MultiIndex / Advanced indexing

https://pandas.pydata.org/docs/user_guide/advanced.html

import seaborn as sns
import pandas as pd
df_penguins = sns.load_dataset('penguins')

groupby and aggregation

# group by
grouped = df_penguins.groupby(['species', 'island', 'sex'])
grouped.agg(['mean', 'median'])
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Female 37.554167 37.60 17.550000 17.45 188.291667 189.0 3395.833333 3400.0
Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
Chinstrap Dream Female 46.573529 46.30 17.588235 17.65 191.735294 192.0 3527.205882 3550.0
Male 51.094118 50.95 19.252941 19.30 199.911765 200.5 3938.970588 3950.0
Gentoo Biscoe Female 45.563793 45.50 14.237931 14.25 212.706897 212.0 4679.741379 4700.0
Male 49.473770 49.50 15.718033 15.70 221.540984 221.0 5484.836066 5500.0

stack and unstack

grouped.agg(['mean', 'median']).unstack(level='species')
bill_length_mm bill_depth_mm ... flipper_length_mm body_mass_g
mean median mean median ... mean median mean median
species Adelie Chinstrap Gentoo Adelie Chinstrap Gentoo Adelie Chinstrap Gentoo Adelie ... Gentoo Adelie Chinstrap Gentoo Adelie Chinstrap Gentoo Adelie Chinstrap Gentoo
island sex
Biscoe Female 37.359091 NaN 45.563793 37.75 NaN 45.5 17.704545 NaN 14.237931 17.70 ... 212.706897 187.0 NaN 212.0 3369.318182 NaN 4679.741379 3375.0 NaN 4700.0
Male 40.590909 NaN 49.473770 40.80 NaN 49.5 19.036364 NaN 15.718033 18.90 ... 221.540984 191.0 NaN 221.0 4050.000000 NaN 5484.836066 4000.0 NaN 5500.0
Dream Female 36.911111 46.573529 NaN 36.80 46.30 NaN 17.618519 17.588235 NaN 17.80 ... NaN 188.0 192.0 NaN 3344.444444 3527.205882 NaN 3400.0 3550.0 NaN
Male 40.071429 51.094118 NaN 40.25 50.95 NaN 18.839286 19.252941 NaN 18.65 ... NaN 190.5 200.5 NaN 4045.535714 3938.970588 NaN 3987.5 3950.0 NaN
Torgersen Female 37.554167 NaN NaN 37.60 NaN NaN 17.550000 NaN NaN 17.45 ... NaN 189.0 NaN NaN 3395.833333 NaN NaN 3400.0 NaN NaN
Male 40.586957 NaN NaN 41.10 NaN NaN 19.391304 NaN NaN 19.20 ... NaN 195.0 NaN NaN 4034.782609 NaN NaN 4000.0 NaN NaN

6 rows × 24 columns

grouped.agg(['mean', 'median']).unstack().stack(level=2)
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Female 37.554167 37.60 17.550000 17.45 188.291667 189.0 3395.833333 3400.0
Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
Chinstrap Dream Female 46.573529 46.30 17.588235 17.65 191.735294 192.0 3527.205882 3550.0
Male 51.094118 50.95 19.252941 19.30 199.911765 200.5 3938.970588 3950.0
Gentoo Biscoe Female 45.563793 45.50 14.237931 14.25 212.706897 212.0 4679.741379 4700.0
Male 49.473770 49.50 15.718033 15.70 221.540984 221.0 5484.836066 5500.0
grouped.agg(['mean', 'median']).index
MultiIndex([(   'Adelie',    'Biscoe', 'Female'),
            (   'Adelie',    'Biscoe',   'Male'),
            (   'Adelie',     'Dream', 'Female'),
            (   'Adelie',     'Dream',   'Male'),
            (   'Adelie', 'Torgersen', 'Female'),
            (   'Adelie', 'Torgersen',   'Male'),
            ('Chinstrap',     'Dream', 'Female'),
            ('Chinstrap',     'Dream',   'Male'),
            (   'Gentoo',    'Biscoe', 'Female'),
            (   'Gentoo',    'Biscoe',   'Male')],
           names=['species', 'island', 'sex'])
grouped.agg(['mean', 'median']).index.values
array([('Adelie', 'Biscoe', 'Female'), ('Adelie', 'Biscoe', 'Male'),
       ('Adelie', 'Dream', 'Female'), ('Adelie', 'Dream', 'Male'),
       ('Adelie', 'Torgersen', 'Female'), ('Adelie', 'Torgersen', 'Male'),
       ('Chinstrap', 'Dream', 'Female'), ('Chinstrap', 'Dream', 'Male'),
       ('Gentoo', 'Biscoe', 'Female'), ('Gentoo', 'Biscoe', 'Male')],
      dtype=object)
grouped.agg(['mean', 'median']).index.get_level_values('sex')
Index(['Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male',
       'Female', 'Male'],
      dtype='object', name='sex')
grouped.agg(['mean', 'median']).index.get_level_values('island')
Index(['Biscoe', 'Biscoe', 'Dream', 'Dream', 'Torgersen', 'Torgersen', 'Dream',
       'Dream', 'Biscoe', 'Biscoe'],
      dtype='object', name='island')
grouped.agg(['mean', 'median']).index.levels
FrozenList([['Adelie', 'Chinstrap', 'Gentoo'], ['Biscoe', 'Dream', 'Torgersen'], ['Female', 'Male']])

Swapping levels, reordering levels

grouped = df_penguins.groupby(['species', 'island', 'sex'])
grouped.agg('mean')
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
species island sex
Adelie Biscoe Female 37.359091 17.704545 187.181818 3369.318182
Male 40.590909 19.036364 190.409091 4050.000000
Dream Female 36.911111 17.618519 187.851852 3344.444444
Male 40.071429 18.839286 191.928571 4045.535714
Torgersen Female 37.554167 17.550000 188.291667 3395.833333
Male 40.586957 19.391304 194.913043 4034.782609
Chinstrap Dream Female 46.573529 17.588235 191.735294 3527.205882
Male 51.094118 19.252941 199.911765 3938.970588
Gentoo Biscoe Female 45.563793 14.237931 212.706897 4679.741379
Male 49.473770 15.718033 221.540984 5484.836066
grouped.agg('mean').swaplevel("species", "island").swaplevel('island', 'sex').sort_index()
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
sex species island
Female Adelie Biscoe 37.359091 17.704545 187.181818 3369.318182
Dream 36.911111 17.618519 187.851852 3344.444444
Torgersen 37.554167 17.550000 188.291667 3395.833333
Chinstrap Dream 46.573529 17.588235 191.735294 3527.205882
Gentoo Biscoe 45.563793 14.237931 212.706897 4679.741379
Male Adelie Biscoe 40.590909 19.036364 190.409091 4050.000000
Dream 40.071429 18.839286 191.928571 4045.535714
Torgersen 40.586957 19.391304 194.913043 4034.782609
Chinstrap Dream 51.094118 19.252941 199.911765 3938.970588
Gentoo Biscoe 49.473770 15.718033 221.540984 5484.836066
grouped.agg('mean').reorder_levels([2, 1, 0], axis=0).sort_index()
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
sex island species
Female Biscoe Adelie 37.359091 17.704545 187.181818 3369.318182
Gentoo 45.563793 14.237931 212.706897 4679.741379
Dream Adelie 36.911111 17.618519 187.851852 3344.444444
Chinstrap 46.573529 17.588235 191.735294 3527.205882
Torgersen Adelie 37.554167 17.550000 188.291667 3395.833333
Male Biscoe Adelie 40.590909 19.036364 190.409091 4050.000000
Gentoo 49.473770 15.718033 221.540984 5484.836066
Dream Adelie 40.071429 18.839286 191.928571 4045.535714
Chinstrap 51.094118 19.252941 199.911765 3938.970588
Torgersen Adelie 40.586957 19.391304 194.913043 4034.782609

“partial” label identifying a subgroup in the data

grouped_df = grouped.agg(['mean', 'median'])
grouped_df
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Female 37.554167 37.60 17.550000 17.45 188.291667 189.0 3395.833333 3400.0
Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
Chinstrap Dream Female 46.573529 46.30 17.588235 17.65 191.735294 192.0 3527.205882 3550.0
Male 51.094118 50.95 19.252941 19.30 199.911765 200.5 3938.970588 3950.0
Gentoo Biscoe Female 45.563793 45.50 14.237931 14.25 212.706897 212.0 4679.741379 4700.0
Male 49.473770 49.50 15.718033 15.70 221.540984 221.0 5484.836066 5500.0
grouped_df.loc['Adelie', :]
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
island sex
Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Female 37.554167 37.60 17.550000 17.45 188.291667 189.0 3395.833333 3400.0
Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
grouped_df.loc['Adelie', "body_mass_g"] # works for the level 0 index on the rows or the columns
mean median
island sex
Biscoe Female 3369.318182 3375.0
Male 4050.000000 4000.0
Dream Female 3344.444444 3400.0
Male 4045.535714 3987.5
Torgersen Female 3395.833333 3400.0
Male 4034.782609 4000.0
grouped_df.loc['Adelie', "mean"] # works for the level 0 index on the rows or the columns
KeyError: 'mean'
grouped_df.loc[['Adelie', 'Gentoo'], ['flipper_length_mm', 'body_mass_g']]
flipper_length_mm body_mass_g
mean median mean median
species island sex
Adelie Biscoe Female 187.181818 187.0 3369.318182 3375.0
Male 190.409091 191.0 4050.000000 4000.0
Dream Female 187.851852 188.0 3344.444444 3400.0
Male 191.928571 190.5 4045.535714 3987.5
Torgersen Female 188.291667 189.0 3395.833333 3400.0
Male 194.913043 195.0 4034.782609 4000.0
Gentoo Biscoe Female 212.706897 212.0 4679.741379 4700.0
Male 221.540984 221.0 5484.836066 5500.0

Cross-sections: Using .xs to selet subset of rows at a given level

grouped_df.xs('Adelie', level=0) 
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
island sex
Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Female 37.554167 37.60 17.550000 17.45 188.291667 189.0 3395.833333 3400.0
Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
grouped_df.xs('Adelie', level=0, drop_level=False) 
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Female 37.554167 37.60 17.550000 17.45 188.291667 189.0 3395.833333 3400.0
Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
grouped_df.xs('Male', level=2, drop_level=False)
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
Chinstrap Dream Male 51.094118 50.95 19.252941 19.30 199.911765 200.5 3938.970588 3950.0
Gentoo Biscoe Male 49.473770 49.50 15.718033 15.70 221.540984 221.0 5484.836066 5500.0
grouped_df.xs(('Adelie', 'Dream'), level=(0, 1), drop_level=False)
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
grouped_df.xs(('Adelie', 'Male'), level=(0, 2), drop_level=False)
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0

using pd.indexSlice to use : and slices

grouped_df
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Dream Female 36.911111 36.80 17.618519 17.80 187.851852 188.0 3344.444444 3400.0
Male 40.071429 40.25 18.839286 18.65 191.928571 190.5 4045.535714 3987.5
Torgersen Female 37.554167 37.60 17.550000 17.45 188.291667 189.0 3395.833333 3400.0
Male 40.586957 41.10 19.391304 19.20 194.913043 195.0 4034.782609 4000.0
Chinstrap Dream Female 46.573529 46.30 17.588235 17.65 191.735294 192.0 3527.205882 3550.0
Male 51.094118 50.95 19.252941 19.30 199.911765 200.5 3938.970588 3950.0
Gentoo Biscoe Female 45.563793 45.50 14.237931 14.25 212.706897 212.0 4679.741379 4700.0
Male 49.473770 49.50 15.718033 15.70 221.540984 221.0 5484.836066 5500.0
idx = pd.IndexSlice
grouped_df.loc[
    idx[:, 'Biscoe', :], # rows
    idx[:, :] # columns
]
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean median mean median mean median mean median
species island sex
Adelie Biscoe Female 37.359091 37.75 17.704545 17.70 187.181818 187.0 3369.318182 3375.0
Male 40.590909 40.80 19.036364 18.90 190.409091 191.0 4050.000000 4000.0
Gentoo Biscoe Female 45.563793 45.50 14.237931 14.25 212.706897 212.0 4679.741379 4700.0
Male 49.473770 49.50 15.718033 15.70 221.540984 221.0 5484.836066 5500.0
grouped_df.loc[
    idx["Adelie", :, "Male"], # rows
    idx[["flipper_length_mm", "body_mass_g"], "median"] # columns
]
flipper_length_mm body_mass_g
median median
species island sex
Adelie Biscoe Male 191.0 4000.0
Dream Male 190.5 3987.5
Torgersen Male 195.0 4000.0

Sorting index for efficient slicing

subset_df = grouped_df.loc[idx[:, ['Dream', 'Biscoe'], ['Male']], idx[:, 'mean']]
subset_df
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean mean mean mean
species island sex
Adelie Dream Male 40.071429 18.839286 191.928571 4045.535714
Chinstrap Dream Male 51.094118 19.252941 199.911765 3938.970588
Adelie Biscoe Male 40.590909 19.036364 190.409091 4050.000000
Gentoo Biscoe Male 49.473770 15.718033 221.540984 5484.836066
subset_df.index.is_monotonic_increasing
False
subset_df.sort_index() # if not monotonic increasing, please sort it for me
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean mean mean mean
species island sex
Adelie Biscoe Male 40.590909 19.036364 190.409091 4050.000000
Dream Male 40.071429 18.839286 191.928571 4045.535714
Chinstrap Dream Male 51.094118 19.252941 199.911765 3938.970588
Gentoo Biscoe Male 49.473770 15.718033 221.540984 5484.836066
subset_df.loc(axis=0)["Adelie":"Chinstrap", :, :]
UnsortedIndexError: 'MultiIndex slicing requires the index to be lexsorted: slicing on levels [0], lexsort depth 0'
subset_df.sort_index().loc(axis=0)["Adelie":"Chinstrap", :, :]
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
mean mean mean mean
species island sex
Adelie Biscoe Male 40.590909 19.036364 190.409091 4050.000000
Dream Male 40.071429 18.839286 191.928571 4045.535714
Chinstrap Dream Male 51.094118 19.252941 199.911765 3938.970588
['a', 'b', 'c', 'd'][0:2]
['a', 'b']