diff --git a/02_pandas.ipynb b/02_pandas.ipynb index 1ec7bc8eb5aa4b8c09db6607286138a625d1c8c7..5fd57fb668d407df91f84176a01b9c3e94ef2cf6 100644 --- a/02_pandas.ipynb +++ b/02_pandas.ipynb @@ -522,7 +522,7 @@ "source": [ "* `Series` represents a single column/vector/data series.\n", "* `DataFrame` consists of multiple `Series` (is a dictionary/list of row/column `Series`).\n", - "* `GroupBy` is created by grouping `DataFrame` rows by values of one of the column `Series`." + "* `DataFrameGroupBy` is created by grouping `DataFrame` rows by values of one of the column `Series`." ] }, { @@ -582,13 +582,6 @@ "execution_count": 13, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x110573e80>\n" - ] - }, { "data": { "text/html": [ @@ -720,8 +713,6 @@ ], "source": [ "df_bygender = df[[\"Gender\", \"Weight\", \"Height\"]].groupby(\"Gender\")\n", - "# it's lazy!\n", - "print(df_bygender)\n", "df_bygender.describe()" ] }, @@ -736,13 +727,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "See [**Data Wrangling with pandas Cheat Sheet**](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf) for a great visual overview of ways of how tidy data frames can be transformed and summarized." + "See [**Data Wrangling with pandas Cheat Sheet**](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf) for a great visual overview of ways of how tidy data frames can be transformed and summarized.\n", + "\n", + "Notably, transformations are lazy - they won't execute until actually needed." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grouping" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11436fa20>\n" + ] + } + ], + "source": [ + "df_bygender = df.groupby(\"Gender\")\n", + "print(df_bygender)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare mean brain weight with mean weight per gender:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -760,7 +785,6 @@ "source": [ "print(df.Weight.mean())\n", "print()\n", - "df_bygender = df.groupby(\"Gender\")\n", "print(df_bygender.Weight.mean())" ] }, @@ -768,7 +792,55 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Plotting directly" + "Grouping is simply a collection of splitted data frames:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Female\n", + " FSIQ VIQ PIQ Weight Height MRI_Count\n", + "count 20.000000 20.000000 20.000000 20.000000 20.000000 20.00000\n", + "mean 111.900000 109.450000 110.450000 137.200000 65.765000 862654.60000\n", + "std 23.686327 21.670924 21.946046 16.953807 2.288248 55893.55578\n", + "min 77.000000 71.000000 72.000000 106.000000 62.000000 790619.00000\n", + "25% 90.250000 90.000000 93.000000 125.750000 64.500000 828062.00000\n", + "50% 115.500000 116.000000 115.000000 138.500000 66.000000 855365.00000\n", + "75% 133.000000 129.000000 128.750000 146.250000 66.875000 882668.50000\n", + "max 140.000000 136.000000 147.000000 175.000000 70.500000 991305.00000\n", + "\n", + "Male\n", + " FSIQ VIQ PIQ Weight Height MRI_Count\n", + "count 20.000000 20.000000 20.000000 18.000000 19.000000 2.000000e+01\n", + "mean 115.000000 115.250000 111.600000 166.444444 71.431579 
9.548554e+05\n", + "std 24.986312 25.640993 23.540335 20.047656 3.283131 5.591135e+04\n", + "min 80.000000 77.000000 74.000000 132.000000 66.300000 8.799870e+05\n", + "25% 89.750000 95.250000 86.000000 148.750000 68.900000 9.195292e+05\n", + "50% 118.000000 110.500000 117.000000 172.000000 70.500000 9.472415e+05\n", + "75% 139.250000 145.000000 128.000000 180.750000 73.750000 9.734960e+05\n", + "max 144.000000 150.000000 150.000000 192.000000 77.000000 1.079549e+06\n", + "\n" + ] + } + ], + "source": [ + "for gender, gender_df in df_bygender:\n", + " print(gender)\n", + " print(gender_df.describe())\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting" ] }, { @@ -780,16 +852,16 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<matplotlib.axes._subplots.AxesSubplot at 0x1105732e8>" + "<matplotlib.axes._subplots.AxesSubplot at 0x11446bd30>" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, @@ -823,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -887,12 +959,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - ".. or directly for the `GroupBy` objects:" + ".. or directly on the `DataFrameGroupBy` objects:" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -903,7 +975,7 @@ "dtype: object" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" },