{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas Notes - A Reference" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset being used here is a sample Sales dataset." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "pd.set_option('display.max_rows', 500)\n", "\n", "df = pd.read_csv(\"./data/sales_data_sample.csv\", encoding='Latin-1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Get rows with NaNs in any column" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ORDERNUMBERQUANTITYORDEREDPRICEEACHORDERLINENUMBERSALESORDERDATESTATUSQTR_IDMONTH_IDYEAR_ID...ADDRESSLINE1ADDRESSLINE2CITYSTATEPOSTALCODECOUNTRYTERRITORYCONTACTLASTNAMECONTACTFIRSTNAMEDEALSIZE
0101073095.7022871.002/24/2003 0:00Shipped122003...897 Long Airport AvenueNaNNYCNY10022USANaNYuKwaiSmall
1101213481.3552765.905/7/2003 0:00Shipped252003...59 rue de l'AbbayeNaNReimsNaN51100FranceEMEAHenriotPaulSmall
2101344194.7423884.347/1/2003 0:00Shipped372003...27 rue du Colonel Pierre AviaNaNParisNaN75508FranceEMEADa CunhaDanielMedium
3101454583.2663746.708/25/2003 0:00Shipped382003...78934 Hillside Dr.NaNPasadenaCA90003USANaNYoungJulieMedium
41015949100.00145205.2710/10/2003 0:00Shipped4102003...7734 Strong St.NaNSan FranciscoCANaNUSANaNBrownJulieMedium
..................................................................
28181035020100.00152244.4012/2/2004 0:00Shipped4122004...C/ Moralzarzal, 86NaNMadridNaN28034SpainEMEAFreyreDiegoSmall
28191037329100.0013978.511/31/2005 0:00Shipped112005...Torikatu 38NaNOuluNaN90110FinlandEMEAKoskitaloPirkkoMedium
28201038643100.0045417.573/1/2005 0:00Resolved132005...C/ Moralzarzal, 86NaNMadridNaN28034SpainEMEAFreyreDiegoMedium
2821103973462.2412116.163/28/2005 0:00Shipped132005...1 rue Alsace-LorraineNaNToulouseNaN31000FranceEMEARouletAnnetteSmall
2822104144765.5293079.445/6/2005 0:00On Hold252005...8616 Spinnaker Dr.NaNBostonMA51003USANaNYoshidoJuriMedium
\n", "

2676 rows × 25 columns

\n", "
" ], "text/plain": [ " ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES \\\n", "0 10107 30 95.70 2 2871.00 \n", "1 10121 34 81.35 5 2765.90 \n", "2 10134 41 94.74 2 3884.34 \n", "3 10145 45 83.26 6 3746.70 \n", "4 10159 49 100.00 14 5205.27 \n", "... ... ... ... ... ... \n", "2818 10350 20 100.00 15 2244.40 \n", "2819 10373 29 100.00 1 3978.51 \n", "2820 10386 43 100.00 4 5417.57 \n", "2821 10397 34 62.24 1 2116.16 \n", "2822 10414 47 65.52 9 3079.44 \n", "\n", " ORDERDATE STATUS QTR_ID MONTH_ID YEAR_ID ... \\\n", "0 2/24/2003 0:00 Shipped 1 2 2003 ... \n", "1 5/7/2003 0:00 Shipped 2 5 2003 ... \n", "2 7/1/2003 0:00 Shipped 3 7 2003 ... \n", "3 8/25/2003 0:00 Shipped 3 8 2003 ... \n", "4 10/10/2003 0:00 Shipped 4 10 2003 ... \n", "... ... ... ... ... ... ... \n", "2818 12/2/2004 0:00 Shipped 4 12 2004 ... \n", "2819 1/31/2005 0:00 Shipped 1 1 2005 ... \n", "2820 3/1/2005 0:00 Resolved 1 3 2005 ... \n", "2821 3/28/2005 0:00 Shipped 1 3 2005 ... \n", "2822 5/6/2005 0:00 On Hold 2 5 2005 ... \n", "\n", " ADDRESSLINE1 ADDRESSLINE2 CITY STATE \\\n", "0 897 Long Airport Avenue NaN NYC NY \n", "1 59 rue de l'Abbaye NaN Reims NaN \n", "2 27 rue du Colonel Pierre Avia NaN Paris NaN \n", "3 78934 Hillside Dr. NaN Pasadena CA \n", "4 7734 Strong St. NaN San Francisco CA \n", "... ... ... ... ... \n", "2818 C/ Moralzarzal, 86 NaN Madrid NaN \n", "2819 Torikatu 38 NaN Oulu NaN \n", "2820 C/ Moralzarzal, 86 NaN Madrid NaN \n", "2821 1 rue Alsace-Lorraine NaN Toulouse NaN \n", "2822 8616 Spinnaker Dr. NaN Boston MA \n", "\n", " POSTALCODE COUNTRY TERRITORY CONTACTLASTNAME CONTACTFIRSTNAME DEALSIZE \n", "0 10022 USA NaN Yu Kwai Small \n", "1 51100 France EMEA Henriot Paul Small \n", "2 75508 France EMEA Da Cunha Daniel Medium \n", "3 90003 USA NaN Young Julie Medium \n", "4 NaN USA NaN Brown Julie Medium \n", "... ... ... ... ... ... ... \n", "2818 28034 Spain EMEA Freyre Diego Small \n", "2819 90110 Finland EMEA Koskitalo Pirkko Medium \n", "2820 28034 Spain EMEA Freyre Diego Medium \n", "2821 31000 France EMEA Roulet Annette Small \n", "2822 51003 USA NaN Yoshido Juri Medium \n", "\n", "[2676 rows x 25 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.isna().any(axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Check non-null counts and Dtypes per column" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 2823 entries, 0 to 2822\n", "Data columns (total 25 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 ORDERNUMBER 2823 non-null int64 \n", " 1 QUANTITYORDERED 2823 non-null int64 \n", " 2 PRICEEACH 2823 non-null float64\n", " 3 ORDERLINENUMBER 2823 non-null int64 \n", " 4 SALES 2823 non-null float64\n", " 5 ORDERDATE 2823 non-null object \n", " 6 STATUS 2823 non-null object \n", " 7 QTR_ID 2823 non-null int64 \n", " 8 MONTH_ID 2823 non-null int64 \n", " 9 YEAR_ID 2823 non-null int64 \n", " 10 PRODUCTLINE 2823 non-null object \n", " 11 MSRP 2823 non-null int64 \n", " 12 PRODUCTCODE 2823 non-null object \n", " 13 CUSTOMERNAME 2823 non-null object \n", " 14 PHONE 2823 non-null object \n", " 15 ADDRESSLINE1 2823 non-null object \n", " 16 ADDRESSLINE2 302 non-null object \n", " 17 CITY 2823 non-null object \n", " 18 STATE 1337 non-null object \n", " 19 POSTALCODE 2747 non-null object \n", " 20 COUNTRY 2823 non-null object \n", " 21 TERRITORY 1749 non-null object \n", " 22 CONTACTLASTNAME 2823 non-null object \n", " 23 CONTACTFIRSTNAME 2823 non-null object \n", " 24 DEALSIZE 2823 non-null object \n", "dtypes: float64(2), int64(7), object(16)\n", "memory usage: 551.5+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Get counts of number of Nulls per column" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
index0
0ORDERNUMBER0
1QUANTITYORDERED0
2PRICEEACH0
3ORDERLINENUMBER0
4SALES0
5ORDERDATE0
6STATUS0
7QTR_ID0
8MONTH_ID0
9YEAR_ID0
10PRODUCTLINE0
11MSRP0
12PRODUCTCODE0
13CUSTOMERNAME0
14PHONE0
15ADDRESSLINE10
16ADDRESSLINE22521
17CITY0
18STATE1486
19POSTALCODE76
20COUNTRY0
21TERRITORY1074
22CONTACTLASTNAME0
23CONTACTFIRSTNAME0
24DEALSIZE0
\n", "
" ], "text/plain": [ " index 0\n", "0 ORDERNUMBER 0\n", "1 QUANTITYORDERED 0\n", "2 PRICEEACH 0\n", "3 ORDERLINENUMBER 0\n", "4 SALES 0\n", "5 ORDERDATE 0\n", "6 STATUS 0\n", "7 QTR_ID 0\n", "8 MONTH_ID 0\n", "9 YEAR_ID 0\n", "10 PRODUCTLINE 0\n", "11 MSRP 0\n", "12 PRODUCTCODE 0\n", "13 CUSTOMERNAME 0\n", "14 PHONE 0\n", "15 ADDRESSLINE1 0\n", "16 ADDRESSLINE2 2521\n", "17 CITY 0\n", "18 STATE 1486\n", "19 POSTALCODE 76\n", "20 COUNTRY 0\n", "21 TERRITORY 1074\n", "22 CONTACTLASTNAME 0\n", "23 CONTACTFIRSTNAME 0\n", "24 DEALSIZE 0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum().reset_index()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Get dataframe excluding all NaNs in a particular Column" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "302" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "no_nan_addresses = df[df['ADDRESSLINE2'].notnull()]\n", "\n", "len(no_nan_addresses)" ] } ], "metadata": { "interpreter": { "hash": "033d5ea8e9748582193a6d8f975af35153e280c1f566336ac6ff582d76ae2a04" }, "kernelspec": { "display_name": "Python 3.6.8 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }