some grouping tricks

some grouping tricks#

import numpy as np
import pandas as pd

dataset 1#

we have the following dataset, these are the results of a survey there is one line per answer, and each person participating has answered 3 questions

what is the overall rating of the trademark - from 1 to 4
what are the most-liked and least-liked sites (among a finite list)

df1 = pd.read_excel('data/groupby-ratings.xlsx')
df1.head()

	rating	favorite	least-liked
0	1	london	oxford
1	2	london	oxford
2	3	london	oxford
3	4	london	oxford
4	4	oxford	oxford

objective#

we want to compute, for each rating, the number of occurrences of each site among the favorite and least-liked columns

# here the dataset is small

grouped = df1.groupby('rating')
grouped.size()

rating
1    4
2    5
3    5
4    4
dtype: int64

this is a good opportunity to see that when calling GroupBy.aggregrate, one can pass a dictionary, that says how to deal with each column

# if you need to see the doc, uncomment this line
# grouped.aggregate?

# note that this is rather fragile though, 
# any variation around that theme is likely to break
# e.g. replacing one value_counts with something like 'sum' issues a RuntimeWarning...

counts = grouped.aggregate({
    # this refers to Series.values_count()
    'favorite':    ['value_counts'],
    'least-liked': ['value_counts'], 
})
counts

		favorite	least-liked
		value_counts	value_counts
rating
1	oxford	2.0	1.0
	aberdeen	1.0	2.0
	london	1.0	1.0
2	aberdeen	2.0	1.0
	oxford	2.0	2.0
	london	1.0	2.0
3	aberdeen	4.0	NaN
3	london	1.0	4.0
4	oxford	2.0	2.0
	aberdeen	1.0	1.0
	london	1.0	1.0
3	oxford	NaN	1.0

how to understand this ? e.g. the first line means that:

among the people who gave a rating of 1:
- 2 have mentioned oxford as their favorite,
- and 1 have mentioned oxford as their least-liked

# here again, the resulting dataframe has a multi-index for both horizontal and vertical dimensions

len(counts.index.levels), len(counts.columns.levels)

(2, 2)

# what's the performance of 'london' among the people who gave a rating of 3

counts.loc[(3, 'london')] 

favorite     value_counts    1.0
least-liked  value_counts    4.0
Name: (3, london), dtype: float64

dataset 2#

df2 = pd.DataFrame(
    {
         "A": ["foo", "far", "foo", "bar", "foo", "bar", "foo", "foo"],
         "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
         "C": np.arange(1, 9),
         "D": 10-np.arange(1, 9),
     })
df2 

	A	B	C	D
0	foo	one	1	9
1	far	one	2	8
2	foo	two	3	7
3	bar	three	4	6
4	foo	two	5	5
5	bar	two	6	4
6	foo	one	7	3
7	foo	three	8	2

simple groupings#

we consider two groupings

by A only
by A and B

for each of these groupings, compute

the mean and min for C
the max for D

# single criteria
grouped_a = df2.groupby('A')

# dual criteria
grouped_ab = df2.groupby(['A', 'B'])

here again, calling aggregate with a dictionary comes in handy

# remember that aggregate can be shortened into just 'agg' 

grouped_a.agg({'C': ['mean', 'min'], 'D': 'max'})

	C		D
	mean	min	max
A
bar	5.0	4	6
far	2.0	2	8
foo	4.8	1	9

# same on the dual-grouping
grouped_ab.agg({'C': ['mean', 'min'], 'D': 'max'})

		C		D
		mean	min	max
A	B
bar	three	4.0	4	6
bar	two	6.0	6	4
far	one	2.0	2	8
foo	one	4.0	1	9
	three	8.0	8	2
	two	4.0	3	7

inspecting a Groupby object#

# how many items in each group

grouped_a.size()

A
bar    2
far    1
foo    5
dtype: int64

# more details of what's in each group; only the indexes show up
grouped_a.groups

{'bar': [3, 5], 'far': [1], 'foo': [0, 2, 4, 6, 7]}

# here the keys are 2-tuples because we have grouped on 2 criteria
grouped_ab.groups

{('bar', 'three'): [3], ('bar', 'two'): [5], ('far', 'one'): [1], ('foo', 'one'): [0, 6], ('foo', 'three'): [7], ('foo', 'two'): [2, 4]}

# one value for the key gives access to one dataframe 
grouped_a.get_group('foo')

	A	B	C	D
0	foo	one	1	9
2	foo	two	3	7
4	foo	two	5	5
6	foo	one	7	3
7	foo	three	8	2

# same here
grouped_ab.get_group(('foo', 'two'))

	A	B	C	D
2	foo	two	3	7
4	foo	two	5	5

grouping from a `Series`#

for advanced users

# we could even group based on finer-grained criteria

# here just on the first letter of the 'A' column
# so the 'foo' and 'far' values are grouped together
grouped_a1c = df2.groupby(df2['A'].str[0])

grouped_a1c.groups

{'b': [3, 5], 'f': [0, 1, 2, 4, 6, 7]}

# explanation: here the parameter to groupby is a Series
df2['A'].str[0]

  f
  f
  f
  b
  f
  b
  f
  f
Name: A, dtype: object

using a function to decide on the grouping#

this means we can group arbitrarily; for example we want to group in two categories, whether the ‘B’ column contains a o or not

df2.B.unique()

array(['one', 'two', 'three'], dtype=object)

so this should give two groups, one the one hand the rows where B is among one and two, on the other hand the ones where B is three

# here too this is a finer-grained criteria
# and this time we define it from a function
# here we group together the lines where the 'B' cell has a 'o'
# which results in 2 groups, 'one' and 'two' in group1, and 'three' in group2

grouped_bo = df2.groupby(df2['B'].apply(lambda b: 'o' in b))

# the way we have written this, 'True' mean 'o' is in the 'B' column
grouped_bo.size()

B
False    2
True     6
dtype: int64

using 2 functions to decide on the grouping#

this can be extended arbitrarily; we could add a grouping in a separate dimension, whether the ‘A’ columns contains ‘f’or not

df2.A.unique()

array(['foo', 'far', 'bar'], dtype=object)

criteria = df2.apply(lambda row: ('f' in row.A, 'o' in row.B), axis=1)

grouped_af_bo = df2.groupby(criteria)

# not very legible
grouped_af_bo.size()

(False, False)    1
(False, True)     1
(True, False)     1
(True, True)      5
dtype: int64

# a little nicer

grouped_af_bo = df2.groupby(
    df2.apply(
        lambda row: ('f in A' if 'f' in row.A else 'nope', 'o in B' if 'o' in row.B else 'nope'), axis=1))

# again
grouped_af_bo.size()

(f in A, nope)      1
(f in A, o in B)    5
(nope, nope)        1
(nope, o in B)      1
dtype: int64

groupby using buckets#

# we could even group by buckets
# so here the values in C are split in 3 groups which should have the same size
# (but because we have 8 lines, not a multiple of 3, it ends up in 3 + 2 + 3
grouped_auto_buckets = df2.groupby(
    pd.qcut(x=df2['C'], q=3, labels=['low', 'mid', 'high']))
grouped_auto_buckets.size()

/tmp/ipykernel_1439/2557907554.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_auto_buckets = df2.groupby(

C
low     3
mid     2
high    3
dtype: int64

# or we could define the boundaries ourselves
grouped_buckets = df2.groupby(pd.cut(df2['C'], [0, 3.5, 6.5, 10.]))
grouped_buckets.size()

/tmp/ipykernel_1439/2978047166.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_buckets = df2.groupby(pd.cut(df2['C'], [0, 3.5, 6.5, 10.]))

C
(0.0, 3.5]     3
(3.5, 6.5]     3
(6.5, 10.0]    2
dtype: int64