Lec 13 - Enriching your dataframe with external information and join/merge

import seaborn as sns
import pandas as pd

# This is a comment in python
# Another comment

# Load the built-in seaborn penguins dataset: one row per observed penguin.
penguins = sns.load_dataset('penguins')
# Peek at the first five rows (output transcribed below).
penguins.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
# third comment

# Hand-built lookup table: approximate coordinates for each island in the
# penguins dataset. NOTE: the key column name 'island name' contains a space,
# so it must be referenced exactly (quoted) in later merges.
geo_df = pd.DataFrame([
    {'island name': 'Torgersen', 'lat': -64.766667, 'lon': -64.083333},
    {'island name': 'Biscoe', 'lat': -65.4333, 'lon': -65.5000},
    {'island name': 'Dream', 'lat':  -64.7333, 'lon': -64.2333}
    ])
geo_df
island name lat lon
0 Torgersen -64.766667 -64.083333
1 Biscoe -65.433300 -65.500000
2 Dream -64.733300 -64.233300
import plotly.express as px
# Plot the three islands on a world map; fitbounds="locations" zooms the view
# to just the plotted points instead of showing the whole globe.
fig = px.scatter_geo(geo_df,
                     lat='lat', lon='lon',
                     hover_name="island name",
                     fitbounds="locations")

fig.show()
# Enrich every penguin row with its island's coordinates.
# Default inner join; because the key columns have different names
# ('island' vs 'island name'), both are kept in the result.
merged_df = penguins.merge(geo_df, left_on='island', right_on='island name')
merged_df.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex island name lat lon
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male Torgersen -64.766667 -64.083333
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female Torgersen -64.766667 -64.083333
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female Torgersen -64.766667 -64.083333
3 Adelie Torgersen NaN NaN NaN NaN NaN Torgersen -64.766667 -64.083333
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female Torgersen -64.766667 -64.083333
# Per-island means. lat/lon are constant within an island, so averaging them
# just carries the coordinates through to the aggregated frame.
df_agg = merged_df.groupby('island')[['body_mass_g', 'flipper_length_mm', 'lat', 'lon']].agg('mean')
df_agg
body_mass_g flipper_length_mm lat lon
island
Biscoe 4716.017964 209.706587 -65.433300 -65.500000
Dream 3712.903226 193.072581 -64.733300 -64.233300
Torgersen 3706.372549 191.196078 -64.766667 -64.083333
# Copy the index (island names) into a regular column so it can be used
# as hover_name below.
df_agg['island'] = df_agg.index
# Marker size encodes mean body mass; colour encodes mean flipper length.
fig = px.scatter_geo(df_agg,
                     lat='lat', lon='lon',
                     size='body_mass_g',
                     color='flipper_length_mm',
                     hover_name="island",
                     fitbounds="locations")

fig.show()

Different types of join

# Little helper to print dataframes side by side
# https://stackoverflow.com/a/44923103/13430450
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args, titles=cycle([''])):
    """Render several dataframes next to each other as inline HTML tables.

    Each positional argument is a dataframe; *titles* supplies a heading per
    frame (missing titles fall back to a '</br>' placeholder).
    """
    parts = []
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        parts.append('<th style="text-align:center"><td style="vertical-align:top">')
        parts.append(f'<h2 style="text-align: center;">{title}</h2>')
        # display:inline lets the browser lay the tables out side by side
        parts.append(df.to_html().replace('table','table style="display:inline"'))
        parts.append('</td></th>')
    display_html(''.join(parts), raw=True)
# Two small demo frames sharing a 'key' column with identical key sets.
left = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    })
right = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    })
# Default merge is an inner join; indicator=True adds a '_merge' column
# saying whether each row matched in left only, right only, or both.
display_side_by_side(left,
                     right,
                     pd.merge(left, right, on="key", indicator=True),
                     titles=['left', 'right', 'merged'])

left

key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3

right

key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3

merged

key A B C D _merge
0 K0 A0 B0 C0 D0 both
1 K1 A1 B1 C1 D1 both
2 K2 A2 B2 C2 D2 both
3 K3 A3 B3 C3 D3 both

Left merge: Use keys from left frame only (in SQL, “LEFT OUTER JOIN”)

# Frames with a compound key (key1, key2); the key sets only partially overlap,
# which lets the different join types below produce different results.
left = pd.DataFrame(
    {
        "key1": ["K0", "K0", "K1", "K2"],
        "key2": ["K0", "K1", "K0", "K1"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)
right = pd.DataFrame(
    {
        "key1": ["K0", "K1", "K1", "K2"],
        "key2": ["K0", "K0", "K0", "K0"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)
# how="left": keep every row of `left`; unmatched rows get NaN in C/D.
# A left key that matches multiple right rows is duplicated (see K1/K0).
display_side_by_side(left,
                     right,
                     pd.merge(left,
                              right,
                              how="left",
                              on=["key1", "key2"],
                              indicator=True
                              ),
                     titles=['left', 'right', 'merged (how=left)'])

left

key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3

right

key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3

merged (how=left)

key1 key2 A B C D _merge
0 K0 K0 A0 B0 C0 D0 both
1 K0 K1 A1 B1 NaN NaN left_only
2 K1 K0 A2 B2 C1 D1 both
3 K1 K0 A2 B2 C2 D2 both
4 K2 K1 A3 B3 NaN NaN left_only

Right merge: Use keys from right frame only (in SQL, “RIGHT OUTER JOIN”)

# how="right": keep every row of `right`; unmatched rows get NaN in A/B.
display_side_by_side(left,
                     right,
                     pd.merge(left, right, how="right", on=["key1", "key2"], indicator=True),
                     titles=['left', 'right', 'merged (how=right)'])

left

key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3

right

key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3

merged (how=right)

key1 key2 A B C D _merge
0 K0 K0 A0 B0 C0 D0 both
1 K1 K0 A2 B2 C1 D1 both
2 K1 K0 A2 B2 C2 D2 both
3 K2 K0 NaN NaN C3 D3 right_only

Outer join: Use union of keys from both frames (in SQL, “FULL OUTER JOIN”)

# how="outer": union of keys from both frames; rows missing on either side
# are filled with NaN and flagged by the indicator column.
display_side_by_side(left,
                     right,
                     pd.merge(left, right, how="outer", on=["key1", "key2"],  indicator=True),
                     titles=['left', 'right', 'merged (how=outer)'])

left

key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3

right

key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3

merged (how=outer)

key1 key2 A B C D _merge
0 K0 K0 A0 B0 C0 D0 both
1 K0 K1 A1 B1 NaN NaN left_only
2 K1 K0 A2 B2 C1 D1 both
3 K1 K0 A2 B2 C2 D2 both
4 K2 K1 A3 B3 NaN NaN left_only
5 K2 K0 NaN NaN C3 D3 right_only

Inner join: Use intersection of keys from both frames (in SQL, “INNER JOIN”)

# how="inner" (the default): only key combinations present in BOTH frames.
display_side_by_side(left,
                     right,
                     pd.merge(left, right, how="inner", on=["key1", "key2"], indicator=True),
                     titles=['left', 'right', 'merged (how=inner)'])

left

key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3

right

key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3

merged (how=inner)

key1 key2 A B C D _merge
0 K0 K0 A0 B0 C0 D0 both
1 K1 K0 A2 B2 C1 D1 both
2 K1 K0 A2 B2 C2 D2 both

how='cross': Create the cartesian product of rows of both frames

# how="cross": cartesian product (4 x 4 = 16 rows); no key columns are used,
# so the overlapping names get the _x/_y suffixes instead.
display_side_by_side(left,
                     right,
                     pd.merge(left, right, how="cross", suffixes=('_x', '_y')),
                     titles=['left', 'right', 'merged (how=cross)'])

left

key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3

right

key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3

merged (how=cross)

key1_x key2_x A B key1_y key2_y C D
0 K0 K0 A0 B0 K0 K0 C0 D0
1 K0 K0 A0 B0 K1 K0 C1 D1
2 K0 K0 A0 B0 K1 K0 C2 D2
3 K0 K0 A0 B0 K2 K0 C3 D3
4 K0 K1 A1 B1 K0 K0 C0 D0
5 K0 K1 A1 B1 K1 K0 C1 D1
6 K0 K1 A1 B1 K1 K0 C2 D2
7 K0 K1 A1 B1 K2 K0 C3 D3
8 K1 K0 A2 B2 K0 K0 C0 D0
9 K1 K0 A2 B2 K1 K0 C1 D1
10 K1 K0 A2 B2 K1 K0 C2 D2
11 K1 K0 A2 B2 K2 K0 C3 D3
12 K2 K1 A3 B3 K0 K0 C0 D0
13 K2 K1 A3 B3 K1 K0 C1 D1
14 K2 K1 A3 B3 K1 K0 C2 D2
15 K2 K1 A3 B3 K2 K0 C3 D3

Another application: Enrich our forex rates dataframe with market data

# Daily forex reference rates: one row per day, one column per currency.
# parse_dates makes 'Time' a datetime64 column so it can be merged on below.
rates = pd.read_csv('rates.csv', parse_dates=['Time'])
rates.head()
Time USD JPY BGN CZK DKK GBP CHF
0 2024-04-16 1.0637 164.54 1.9558 25.210 7.4609 0.85440 0.9712
1 2024-04-15 1.0656 164.05 1.9558 25.324 7.4606 0.85405 0.9725
2 2024-04-12 1.0652 163.16 1.9558 25.337 7.4603 0.85424 0.9716
3 2024-04-11 1.0729 164.18 1.9558 25.392 7.4604 0.85525 0.9787
4 2024-04-10 1.0860 164.89 1.9558 25.368 7.4594 0.85515 0.9810
import yfinance as yf

msft = yf.Ticker("MSFT")
# NOTE(review): .info makes a network request; in a notebook the result is
# displayed, but nothing here stores it.
msft.info
# Last 6 months of daily OHLCV data, indexed by a tz-aware DatetimeIndex.
msft_hist = msft.history(period="6mo")
msft_hist.head()
Open High Low Close Volume Dividends Stock Splits
Date
2023-10-23 00:00:00-04:00 324.211180 331.443110 323.135370 328.046295 24374700 0.0 0.0
2023-10-24 00:00:00-04:00 330.018628 330.556548 326.332957 329.251617 31153600 0.0 0.0
2023-10-25 00:00:00-04:00 343.685568 344.861028 336.314195 339.352417 55053800 0.0 0.0
2023-10-26 00:00:00-04:00 339.222925 340.308706 325.675519 326.621857 37828500 0.0 0.0
2023-10-27 00:00:00-04:00 329.152021 335.417702 327.129873 328.534424 29856500 0.0 0.0
# create a column for date only (without timezone/hours/minutes/seconds)
# .date strips both the timezone and the time-of-day from the index, and
# pd.to_datetime turns it back into a naive datetime64 column that can be
# matched against the naive dates in the rates frame.
msft_hist['date_only'] = pd.to_datetime(msft_hist.index.date)
msft_hist.head()
Open High Low Close Volume Dividends Stock Splits date_only
Date
2023-10-23 00:00:00-04:00 324.211180 331.443110 323.135370 328.046295 24374700 0.0 0.0 2023-10-23
2023-10-24 00:00:00-04:00 330.018628 330.556548 326.332957 329.251617 31153600 0.0 0.0 2023-10-24
2023-10-25 00:00:00-04:00 343.685568 344.861028 336.314195 339.352417 55053800 0.0 0.0 2023-10-25
2023-10-26 00:00:00-04:00 339.222925 340.308706 325.675519 326.621857 37828500 0.0 0.0 2023-10-26
2023-10-27 00:00:00-04:00 329.152021 335.417702 327.129873 328.534424 29856500 0.0 0.0 2023-10-27
# Same as MSFT above, for the SPY S&P 500 ETF.
spy = yf.Ticker("SPY")
spy.info
spy_hist = spy.history(period="6mo")
spy_hist.head()
Open High Low Close Volume Dividends Stock Splits Capital Gains
Date
2023-10-23 00:00:00-04:00 416.621136 421.426688 414.824031 417.465088 92035100 0.0 0.0 0.0
2023-10-24 00:00:00-04:00 419.639488 421.794044 417.743089 420.612518 78564200 0.0 0.0 0.0
2023-10-25 00:00:00-04:00 418.884919 418.914704 414.049582 414.575806 94223200 0.0 0.0 0.0
2023-10-26 00:00:00-04:00 413.483664 414.357371 408.668205 409.611420 115156800 0.0 0.0 0.0
2023-10-27 00:00:00-04:00 411.239769 411.646852 406.295230 407.754761 107367700 0.0 0.0 0.0
# create a column for date only (without timezone/hours/minutes/seconds)

# Same tz-and-time stripping as for msft_hist above.
spy_hist['date_only'] = pd.to_datetime(spy_hist.index.date)
spy_hist.head()
Open High Low Close Volume Dividends Stock Splits Capital Gains date_only
Date
2023-10-23 00:00:00-04:00 416.621136 421.426688 414.824031 417.465088 92035100 0.0 0.0 0.0 2023-10-23
2023-10-24 00:00:00-04:00 419.639488 421.794044 417.743089 420.612518 78564200 0.0 0.0 0.0 2023-10-24
2023-10-25 00:00:00-04:00 418.884919 418.914704 414.049582 414.575806 94223200 0.0 0.0 0.0 2023-10-25
2023-10-26 00:00:00-04:00 413.483664 414.357371 408.668205 409.611420 115156800 0.0 0.0 0.0 2023-10-26
2023-10-27 00:00:00-04:00 411.239769 411.646852 406.295230 407.754761 107367700 0.0 0.0 0.0 2023-10-27
# Enrich the rates with the SPY closing price for the matching calendar day.
# Rename the right frame's 'Close' column *before* merging rather than
# overwriting df_with_spy.columns with a positional list afterwards: the
# positional list silently mislabels columns if the column order ever changes.
df_with_spy = rates.merge(
    spy_hist[['date_only', 'Close']].rename(columns={'Close': 'Close_spy'}),
    how='left',  # keep every rates row, even days the market was closed
    left_on='Time',
    right_on='date_only')
df_with_spy
Time USD JPY BGN CZK DKK GBP CHF date_only Close_spy
0 2024-04-16 1.0637 164.54 1.9558 25.210 7.4609 0.85440 0.9712 2024-04-16 503.529999
1 2024-04-15 1.0656 164.05 1.9558 25.324 7.4606 0.85405 0.9725 2024-04-15 504.450012
2 2024-04-12 1.0652 163.16 1.9558 25.337 7.4603 0.85424 0.9716 2024-04-12 510.850006
3 2024-04-11 1.0729 164.18 1.9558 25.392 7.4604 0.85525 0.9787 2024-04-11 518.000000
4 2024-04-10 1.0860 164.89 1.9558 25.368 7.4594 0.85515 0.9810 2024-04-10 514.119995
... ... ... ... ... ... ... ... ... ... ...
57 2024-01-24 1.0905 160.46 1.9558 24.786 7.4568 0.85543 0.9415 2024-01-24 483.886566
58 2024-01-23 1.0872 160.88 1.9558 24.824 7.4574 0.85493 0.9446 2024-01-23 483.358185
59 2024-01-22 1.0890 160.95 1.9558 24.758 7.4585 0.85575 0.9458 2024-01-22 481.952576
60 2024-01-19 1.0887 161.17 1.9558 24.813 7.4575 0.85825 0.9459 2024-01-19 480.935699
61 2024-01-18 1.0875 160.89 1.9558 24.734 7.4571 0.85773 0.9432 2024-01-18 475.014099

62 rows × 10 columns

# Add the MSFT close the same way. Renaming 'Close' -> 'Close_MSFT' on the
# right frame before merging avoids the fragile positional re-assignment of
# the full columns list (which breaks silently if column order changes).
df_with_spy_with_msft = df_with_spy.merge(
    msft_hist[['date_only', 'Close']].rename(columns={'Close': 'Close_MSFT'}),
    on='date_only')
df_with_spy_with_msft.head()
Time USD JPY BGN CZK DKK GBP CHF date_only Close_spy Close_MSFT
0 2024-04-16 1.0637 164.54 1.9558 25.210 7.4609 0.85440 0.9712 2024-04-16 503.529999 414.579987
1 2024-04-15 1.0656 164.05 1.9558 25.324 7.4606 0.85405 0.9725 2024-04-15 504.450012 413.640015
2 2024-04-12 1.0652 163.16 1.9558 25.337 7.4603 0.85424 0.9716 2024-04-12 510.850006 421.899994
3 2024-04-11 1.0729 164.18 1.9558 25.392 7.4604 0.85525 0.9787 2024-04-11 518.000000 427.929993
4 2024-04-10 1.0860 164.89 1.9558 25.368 7.4594 0.85515 0.9810 2024-04-10 514.119995 423.260010
from matplotlib import pyplot as plt

# Draw the three series on one shared axes by passing the same ax to each plot.
# NOTE(review): JPY (~160) and the stock closes (~400-500) are on quite
# different scales — a secondary y-axis might read better; confirm intent.
ax = plt.gca()
df_with_spy_with_msft.plot(x='Time', y='JPY', ax=ax)
df_with_spy_with_msft.plot(x='Time', y='Close_spy', ax=ax)
df_with_spy_with_msft.plot(x='Time', y='Close_MSFT', ax=ax)

NYC flights data: plot destination airports with marker size proportional to the number of flights to each particular airport

# The nycflights13 dataset: all flights departing NYC airports in 2013,
# plus lookup tables for airports, airlines, planes and weather.
from nycflights13 import flights, airports, airlines, planes, weather
flights.head()
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour
0 2013 1 1 517.0 515 2.0 830.0 819 11.0 UA 1545 N14228 EWR IAH 227.0 1400 5 15 2013-01-01T10:00:00Z
1 2013 1 1 533.0 529 4.0 850.0 830 20.0 UA 1714 N24211 LGA IAH 227.0 1416 5 29 2013-01-01T10:00:00Z
2 2013 1 1 542.0 540 2.0 923.0 850 33.0 AA 1141 N619AA JFK MIA 160.0 1089 5 40 2013-01-01T10:00:00Z
3 2013 1 1 544.0 545 -1.0 1004.0 1022 -18.0 B6 725 N804JB JFK BQN 183.0 1576 5 45 2013-01-01T10:00:00Z
4 2013 1 1 554.0 600 -6.0 812.0 837 -25.0 DL 461 N668DN LGA ATL 116.0 762 6 0 2013-01-01T11:00:00Z
# Airports lookup table: 'faa' is the airport code flights.dest refers to.
airports.head()
faa name lat lon alt tz dst tzone
0 04G Lansdowne Airport 41.130472 -80.619583 1044 -5 A America/New_York
1 06A Moton Field Municipal Airport 32.460572 -85.680028 264 -6 A America/Chicago
2 06C Schaumburg Regional 41.989341 -88.101243 801 -6 A America/Chicago
3 06N Randall Airport 41.431912 -74.391561 523 -5 A America/New_York
4 09J Jekyll Island Airport 31.074472 -81.427778 11 -5 A America/New_York
# Planes lookup table, keyed by tail number.
planes.head()
tailnum year type manufacturer model engines seats speed engine
0 N10156 2004.0 Fixed wing multi engine EMBRAER EMB-145XR 2 55 NaN Turbo-fan
1 N102UW 1998.0 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NaN Turbo-fan
2 N103US 1999.0 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NaN Turbo-fan
3 N104UW 1999.0 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NaN Turbo-fan
4 N10575 2002.0 Fixed wing multi engine EMBRAER EMB-145LR 2 55 NaN Turbo-fan
# Airlines lookup table: carrier code -> full airline name.
airlines.head()
carrier name
0 9E Endeavor Air Inc.
1 AA American Airlines Inc.
2 AS Alaska Airlines Inc.
3 B6 JetBlue Airways
4 DL Delta Air Lines Inc.
# Hourly weather observations at the NYC origin airports.
weather.head()
origin year month day hour temp dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour
0 EWR 2013 1 1 1 39.02 26.06 59.37 270.0 10.35702 NaN 0.0 1012.0 10.0 2013-01-01T06:00:00Z
1 EWR 2013 1 1 2 39.02 26.96 61.63 250.0 8.05546 NaN 0.0 1012.3 10.0 2013-01-01T07:00:00Z
2 EWR 2013 1 1 3 39.02 28.04 64.43 240.0 11.50780 NaN 0.0 1012.5 10.0 2013-01-01T08:00:00Z
3 EWR 2013 1 1 4 39.92 28.04 62.21 250.0 12.65858 NaN 0.0 1012.2 10.0 2013-01-01T09:00:00Z
4 EWR 2013 1 1 5 39.02 28.04 64.43 260.0 12.65858 NaN 0.0 1011.9 10.0 2013-01-01T10:00:00Z
# count number of flights per airport
# groupby('dest').size() counts rows per destination; .rename sets the Series
# name, which becomes the column name when this is merged into a frame later.
series_number_of_flights = flights.groupby('dest').size().rename('Count flights')
series_number_of_flights
dest
ABQ      254
ACK      265
ALB      439
ANC        8
ATL    17215
       ...  
TPA     7466
TUL      315
TVC      101
TYS      631
XNA     1036
Name: Count flights, Length: 105, dtype: int64
# Join the per-destination flight counts onto the airports table.
# right_index=True: the counts Series is indexed by FAA code.
# The default inner join keeps only airports that actually receive NYC
# flights (101 rows remain, per the output below).
airports_with_flight_counts = airports.merge(series_number_of_flights,
                                            left_on='faa',
                                            right_index=True)
airports_with_flight_counts
faa name lat lon alt tz dst tzone Count flights
87 ABQ Albuquerque International Sunport 35.040222 -106.609194 5355 -7 A America/Denver 254
91 ACK Nantucket Mem 41.253053 -70.060181 48 -5 A America/New_York 265
118 ALB Albany Intl 42.748267 -73.801692 285 -5 A America/New_York 439
128 ANC Ted Stevens Anchorage Intl 61.174361 -149.996361 152 -9 A America/Anchorage 8
153 ATL Hartsfield Jackson Atlanta Intl 33.636719 -84.428067 1026 -5 A America/New_York 17215
... ... ... ... ... ... ... ... ... ...
1327 TPA Tampa Intl 27.975472 -82.533250 26 -5 A America/New_York 7466
1334 TUL Tulsa Intl 36.198389 -95.888111 677 -6 A America/Chicago 315
1337 TVC Cherry Capital Airport 44.741445 -85.582235 624 -5 A America/New_York 101
1347 TYS Mc Ghee Tyson 35.810972 -83.994028 981 -5 A America/New_York 631
1430 XNA NW Arkansas Regional 36.281869 -94.306811 1287 -6 A America/Chicago 1036

101 rows × 9 columns

# SFO and LAX (large airport)
# Sanity check: two large, well-known destinations should have big counts.
airports_with_flight_counts.query('faa == "LAX" or faa == "SFO"')
faa name lat lon alt tz dst tzone Count flights
770 LAX Los Angeles Intl 33.942536 -118.408075 126 -8 A America/Los_Angeles 16174
1216 SFO San Francisco Intl 37.618972 -122.374889 13 -8 A America/Los_Angeles 13331
# Map of destination airports; marker size is proportional to the number
# of flights from NYC to that airport.
fig = px.scatter_geo(airports_with_flight_counts,
                     lat='lat', lon='lon',
                     hover_name="name",
                     fitbounds="locations",
                     size='Count flights'
                    )
fig.show()
# Mean departure delay per destination airport.
# Bug fix: grouping the two-column frame flights[['dest', 'dep_delay']]
# returns a *DataFrame*, so the original `.name = 'delay'` assignment was a
# silent no-op (DataFrames have no Series name). Selecting the column gives
# a real Series whose name ('dep_delay') becomes the column name when this
# is merged into the airports frame below.
series_airport_with_mean_delay = flights.groupby('dest')['dep_delay'].mean()
series_airport_with_mean_delay.head()
dep_delay
dest
ABQ 13.740157
ACK 6.456604
ALB 23.620525
ANC 12.875000
ATL 12.509824
# Chain two merges onto airports: flight counts, then mean departure delay.
# Both right-hand objects are indexed by FAA code, hence right_index=True.
airports_with_flight_counts_with_delays = airports.merge(
    series_number_of_flights,
    left_on='faa',
    right_index=True
).merge(
        series_airport_with_mean_delay,
        left_on='faa',
        right_index=True
)
# Spot check the two big west-coast destinations.
airports_with_flight_counts_with_delays.query('faa == "LAX" or faa == "SFO"')
faa name lat lon alt tz dst tzone Count flights dep_delay
770 LAX Los Angeles Intl 33.942536 -118.408075 126 -8 A America/Los_Angeles 16174 9.401344
1216 SFO San Francisco Intl 37.618972 -122.374889 13 -8 A America/Los_Angeles 13331 12.866289
# Same map as before, now with colour encoding the mean departure delay.
fig = px.scatter_geo(airports_with_flight_counts_with_delays,
                     lat='lat', lon='lon',
                     hover_name="name",
                     fitbounds="locations",
                     size='Count flights',
                     color='dep_delay'
                    )
fig.show()