|
5 | 5 | "metadata": {}, |
6 | 6 | "source": [ |
7 | 7 | "# Pandas tip #3: Transform .groupby() result back into original DataFrame\n", |
8 | | - "Sometime you need to get a statistic from a subgroup in a dataset and require this statistic to be in the original dataset. Previously I did this in multiple steps but this can also be achieved by a lesser known method in Pandas: `.transform()`.\n", |
| 8 | + "Sometimes you need to get a statistic from a subgroup in a dataset and require this statistic to be in the original dataset. Previously I did this in multiple steps, but this can also be achieved with a lesser-known method in Pandas: `.transform()`.\n", |
9 | 9 | "\n", |
10 | | - "The `.transform()` method acts very similarly to the `.apply()` function and is especially powerfull after a `.groupby()`. It does a sort of `.apply()` on the groupby result and then transforms this into the length of the original DataFrame. Lets have a look a some artificial data:" |
| 10 | + "The `.transform()` method acts very similarly to the `.apply()` function and is especially powerful after a `.groupby()`. It does a sort of `.apply()` on the groupby result and then broadcasts the result back to the length of the original DataFrame. Let's have a look at some artificial data:" |
11 | 11 | ] |
12 | 12 | }, |
13 | 13 | { |
14 | 14 | "cell_type": "code", |
15 | | - "execution_count": 15, |
| 15 | + "execution_count": 33, |
16 | 16 | "metadata": {}, |
17 | 17 | "outputs": [], |
18 | 18 | "source": [ |
|
42 | 42 | }, |
43 | 43 | { |
44 | 44 | "cell_type": "code", |
45 | | - "execution_count": 17, |
| 45 | + "execution_count": 34, |
46 | 46 | "metadata": {}, |
47 | 47 | "outputs": [ |
48 | 48 | { |
|
55 | 55 | "Name: group, dtype: int64" |
56 | 56 | ] |
57 | 57 | }, |
58 | | - "execution_count": 17, |
| 58 | + "execution_count": 34, |
59 | 59 | "metadata": {}, |
60 | 60 | "output_type": "execute_result" |
61 | 61 | } |
|
74 | 74 | }, |
75 | 75 | { |
76 | 76 | "cell_type": "code", |
77 | | - "execution_count": 22, |
| 77 | + "execution_count": 35, |
78 | 78 | "metadata": {}, |
79 | 79 | "outputs": [ |
80 | 80 | { |
|
88 | 88 | "Name: spend_money, dtype: float64" |
89 | 89 | ] |
90 | 90 | }, |
91 | | - "execution_count": 22, |
| 91 | + "execution_count": 35, |
92 | 92 | "metadata": {}, |
93 | 93 | "output_type": "execute_result" |
94 | 94 | } |
|
109 | 109 | }, |
110 | 110 | { |
111 | 111 | "cell_type": "code", |
112 | | - "execution_count": 25, |
| 112 | + "execution_count": 36, |
113 | 113 | "metadata": {}, |
114 | 114 | "outputs": [ |
115 | 115 | { |
|
239 | 239 | "[100 rows x 4 columns]" |
240 | 240 | ] |
241 | 241 | }, |
242 | | - "execution_count": 25, |
| 242 | + "execution_count": 36, |
243 | 243 | "metadata": {}, |
244 | 244 | "output_type": "execute_result" |
245 | 245 | } |
|
261 | 261 | }, |
262 | 262 | { |
263 | 263 | "cell_type": "code", |
264 | | - "execution_count": 29, |
| 264 | + "execution_count": 37, |
265 | 265 | "metadata": {}, |
266 | 266 | "outputs": [ |
267 | 267 | { |
|
270 | 270 | "29.8" |
271 | 271 | ] |
272 | 272 | }, |
273 | | - "execution_count": 29, |
| 273 | + "execution_count": 37, |
274 | 274 | "metadata": {}, |
275 | 275 | "output_type": "execute_result" |
276 | 276 | } |
|
292 | 292 | }, |
293 | 293 | { |
294 | 294 | "cell_type": "code", |
295 | | - "execution_count": 31, |
| 295 | + "execution_count": 38, |
296 | 296 | "metadata": {}, |
297 | 297 | "outputs": [ |
298 | 298 | { |
|
320 | 320 | " <th>group</th>\n", |
321 | 321 | " <th>spend_money</th>\n", |
322 | 322 | " <th>group_mean</th>\n", |
323 | | - " <th>2nd_place</th>\n", |
| 323 | + " <th>2nd_lowest</th>\n", |
324 | 324 | " </tr>\n", |
325 | 325 | " </thead>\n", |
326 | 326 | " <tbody>\n", |
|
418 | 418 | "</div>" |
419 | 419 | ], |
420 | 420 | "text/plain": [ |
421 | | - " id group spend_money group_mean 2nd_place\n", |
422 | | - "0 0 A 25.01 504.031724 29.80\n", |
423 | | - "1 1 C 244.89 530.030909 111.55\n", |
424 | | - "2 2 B 736.47 506.783600 94.33\n", |
425 | | - "3 3 A 590.49 504.031724 29.80\n", |
426 | | - "4 4 A 29.80 504.031724 29.80\n", |
427 | | - ".. .. ... ... ... ...\n", |
428 | | - "95 95 C 800.59 530.030909 111.55\n", |
429 | | - "96 96 A 248.66 504.031724 29.80\n", |
430 | | - "97 97 B 536.29 506.783600 94.33\n", |
431 | | - "98 98 B 421.88 506.783600 94.33\n", |
432 | | - "99 99 C 462.63 530.030909 111.55\n", |
| 421 | + " id group spend_money group_mean 2nd_lowest\n", |
| 422 | + "0 0 A 25.01 504.031724 29.80\n", |
| 423 | + "1 1 C 244.89 530.030909 111.55\n", |
| 424 | + "2 2 B 736.47 506.783600 94.33\n", |
| 425 | + "3 3 A 590.49 504.031724 29.80\n", |
| 426 | + "4 4 A 29.80 504.031724 29.80\n", |
| 427 | + ".. .. ... ... ... ...\n", |
| 428 | + "95 95 C 800.59 530.030909 111.55\n", |
| 429 | + "96 96 A 248.66 504.031724 29.80\n", |
| 430 | + "97 97 B 536.29 506.783600 94.33\n", |
| 431 | + "98 98 B 421.88 506.783600 94.33\n", |
| 432 | + "99 99 C 462.63 530.030909 111.55\n", |
433 | 433 | "\n", |
434 | 434 | "[100 rows x 5 columns]" |
435 | 435 | ] |
436 | 436 | }, |
437 | | - "execution_count": 31, |
| 437 | + "execution_count": 38, |
438 | 438 | "metadata": {}, |
439 | 439 | "output_type": "execute_result" |
440 | 440 | } |
|
446 | 446 | " else:\n", |
447 | 447 | " return None\n", |
448 | 448 | "\n", |
449 | | - "df['2nd_place'] = (df\n", |
| 449 | + "df['2nd_lowest'] = (df\n", |
450 | 450 | " .groupby('group')['spend_money']\n", |
451 | 451 | " .transform(second_from_group)\n", |
452 | 452 | ")\n", |
|
0 commit comments