(Python : Basic) DataFrame combining

Posted by : Python at Sep 1, 2020

Category : Python

!pip install finance-datareader
!pip install beautifulsoup4 
!pip install numpy
!pip install pandas

# default settings
import numpy as np
import pandas as pd

# jupyter notebook 여러 실행인자 실행해도 print되게 만들기
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('max_columns', None)

DataFrame row 추가

df = pd.DataFrame(columns=['a', 'b'])
df.head()

	a	b

df.loc[0] = [1, 2]
df.head()

	a	b
0	1	2

df.loc['ㅋㅋ'] = [1, 2]
df.head()

	a	b
0	1	2
ㅋㅋ	1	2

df.loc[len(df)] = {'b' : 'ㅎ', 'a': 'ㅋ'}
df.head()

	a	b
0	1	2
ㅋㅋ	1	2
2	ㅋ	ㅎ

df.loc["yay"] = pd.Series({'a': 'ㅋ', 'b' : 'ㅎ'})
df.tail()

	a	b
0	1	2
ㅋㅋ	1	2
2	ㅋ	ㅎ
yay	ㅋ	ㅎ

# 이미 존재한는 index에 넣기 
df.loc["yay"] = pd.Series({'a': '1111', 'b' : '2222'})
df.tail()

	a	b
0	1	2
ㅋㅋ	1	2
2	ㅋ	ㅎ
yay	1111	2222

append()

names_df = pd.DataFrame(
    {
        'Name':['철수', '영희', '영수', '영미'],
        'Age':[12, 13, 14, 15]
    },
    index = ['Canada', 'Canada', 'USA', 'USA']
)
names_df

	Name	Age
Canada	철수	12
Canada	영희	13
USA	영수	14
USA	영미	15

# Error (에러내용 확인!) => index를 뭐로 설정해야될지 모르기 때문
names_df.append(
    {'Name':'명수', 'Age':1},
    ignore_index=True   # ignore이 없으면 Error
)

	Name	Age
0	철수	12
1	영희	13
2	영수	14
3	영미	15
4	명수	1

# 리스트 of 딕셔너리로도 전달 가능
names_df.append(
    [
        {'Name':'명수', 'Age':1},
        {'Name':'동수', 'Age':2}
    ], 
    ignore_index=True
)

	Name	Age
0	철수	12
1	영희	13
2	영수	14
3	영미	15
4	명수	1
5	동수	2

names_df

	Name	Age
Canada	철수	12
Canada	영희	13
USA	영수	14
USA	영미	15

s = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names_df))
s

Name    Zach
Age        3
Name: 4, dtype: object

names_df.append(s)

	Name	Age
Canada	철수	12
Canada	영희	13
USA	영수	14
USA	영미	15
4	Zach	3

# 리스트 of Series로도 전달 가능
s1 = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names_df))
s2 = pd.Series({'Name': 'Zayd', 'Age': 2}, name='USA')
names_df.append([s1, s2])

	Name	Age
Canada	철수	12
Canada	영희	13
USA	영수	14
USA	영미	15
4	Zach	3
USA	Zayd	2

# 원본에 반영안됨.
names_df

	Name	Age
Canada	철수	12
Canada	영희	13
USA	영수	14
USA	영미	15

s1
s2
pd.concat([s1, s2], axis=1)

Name    Zach
Age        3
Name: 4, dtype: object

Name    Zayd
Age        2
Name: USA, dtype: object

	4	USA
Name	Zach	Zayd
Age	3	2

concat()

!pip install finance-datareader
import FinanceDataReader as fdr

samsung_df = fdr.DataReader('005390', '2009-01-01', '2017-12-31')
kodex_df = fdr.DataReader('069500', '2016-01-01', '2017-12-31')

samsung_df.tail(2)
kodex_df.tail(2)

	Open	High	Low	Close	Volume	Change
Date
2017-12-27	1080	1095	1060	1095	428161	0.033
2017-12-28	1095	1095	1050	1065	559604	-0.027

	Open	High	Low	Close	Volume	Change
Date
2017-12-27	29559	29803	29499	29810	13743517	0.012
2017-12-28	29817	30210	29812	30184	8211476	0.013

pd.concat(
    [samsung_df, kodex_df]
) 
# samsung_df
# kodex_df
# 이런순으로 쌓임

	Open	High	Low	Close	Volume	Change
Date
2009-01-02	200	234	200	234	2122	0.083
2009-01-05	229	234	213	223	6890	-0.047
2009-01-06	210	234	210	212	2144	-0.049
2009-01-07	214	220	208	216	2434	0.019
2009-01-08	216	220	200	215	2280	-0.005
...	...	...	...	...	...	...
2017-12-21	29947	30002	29436	29442	9299734	-0.019
2017-12-22	29518	29646	29444	29584	9212290	0.005
2017-12-26	29643	29821	29458	29464	8257476	-0.004
2017-12-27	29559	29803	29499	29810	13743517	0.012
2017-12-28	29817	30210	29812	30184	8211476	0.013

2718 rows × 6 columns

kodex_df[['Open', 'Close']].head()

	Open	Close
Date
2016-01-04	21477	21121
2016-01-05	21068	21253
2016-01-06	21242	21099
2016-01-07	21043	20896
2016-01-08	20767	21045

pd.concat(
    [
        samsung_df, 
        kodex_df[['Open', 'Close']]
    ]
).tail(2)   # head()도 한번 실행해보세요!

	Open	High	Low	Close	Volume	Change
Date
2017-12-27	29559	NaN	NaN	29810	NaN	NaN
2017-12-28	29817	NaN	NaN	30184	NaN	NaN

pd.concat(
    [samsung_df, kodex_df],
    keys=['삼성', 'KODEX200'], 
).head()

		Open	High	Low	Close	Volume	Change
	Date
삼성	2009-01-02	200	234	200	234	2122	0.083
	2009-01-05	229	234	213	223	6890	-0.047
	2009-01-06	210	234	210	212	2144	-0.049
	2009-01-07	214	220	208	216	2434	0.019
	2009-01-08	216	220	200	215	2280	-0.005

pd.concat(
    [samsung_df, kodex_df],
    keys=['삼성', 'KODEX200'],
    names=['종목명']
).head()

		Open	High	Low	Close	Volume	Change
종목명	Date
삼성	2009-01-02	200	234	200	234	2122	0.083
	2009-01-05	229	234	213	223	6890	-0.047
	2009-01-06	210	234	210	212	2144	-0.049
	2009-01-07	214	220	208	216	2434	0.019
	2009-01-08	216	220	200	215	2280	-0.005

pd.concat(
    [samsung_df, kodex_df],
    keys=['삼성', 'KODEX200'],
    names=['종목명', '날짜']
).head()

		Open	High	Low	Close	Volume	Change
종목명	날짜
삼성	2009-01-02	200	234	200	234	2122	0.083
	2009-01-05	229	234	213	223	6890	-0.047
	2009-01-06	210	234	210	212	2144	-0.049
	2009-01-07	214	220	208	216	2434	0.019
	2009-01-08	216	220	200	215	2280	-0.005

pd.concat([samsung_df, kodex_df], axis=1).head()

	Open	High	Low	Close	Volume	Change	Open	High	Low	Close	Volume	Change
Date
2009-01-02	200	234	200	234	2122	0.083	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-05	229	234	213	223	6890	-0.047	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-06	210	234	210	212	2144	-0.049	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-07	214	220	208	216	2434	0.019	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-08	216	220	200	215	2280	-0.005	NaN	NaN	NaN	NaN	NaN	NaN

# outter join
pd.concat([samsung_df, kodex_df], keys=['삼성', 'KODEX200'], axis=1).head(2)

	삼성						KODEX200
	Open	High	Low	Close	Volume	Change	Open	High	Low	Close	Volume	Change
Date
2009-01-02	200	234	200	234	2122	0.083	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-05	229	234	213	223	6890	-0.047	NaN	NaN	NaN	NaN	NaN	NaN

# inner join
pd.concat([samsung_df, kodex_df], keys=['삼성', 'kodex'], axis=1, names=['종목명'], join='inner').head()

종목명	삼성						kodex
	Open	High	Low	Close	Volume	Change	Open	High	Low	Close	Volume	Change
Date
2016-01-04	1470	1485	1430	1430	862249	-0.017	21477	21485	21109	21121	7823927	-0.020
2016-01-05	1405	1440	1400	1400	844949	-0.021	21068	21318	21068	21253	14025395	0.006
2016-01-06	1415	1435	1390	1400	1061873	0.000	21242	21265	20981	21099	12281225	-0.007
2016-01-07	1405	1415	1350	1350	1184923	-0.036	21043	21122	20837	20896	12663306	-0.010
2016-01-08	1315	1365	1290	1360	886329	0.007	20767	21054	20721	21045	13841294	0.007

# axis=0, inner join
pd.concat([samsung_df.head(), kodex_df[['Close']].head()], join='inner')

	Close
Date
2009-01-02	234
2009-01-05	223
2009-01-06	212
2009-01-07	216
2009-01-08	215
2016-01-04	21121
2016-01-05	21253
2016-01-06	21099
2016-01-07	20896
2016-01-08	21045

samsung_diff_col_df = samsung_df.copy()
samsung_diff_col_df.columns = ['1_' + col  for col in samsung_df.columns]
samsung_diff_col_df.head()
samsung_df.head()

	1_Open	1_High	1_Low	1_Close	1_Volume	1_Change
Date
2009-01-02	200	234	200	234	2122	0.083
2009-01-05	229	234	213	223	6890	-0.047
2009-01-06	210	234	210	212	2144	-0.049
2009-01-07	214	220	208	216	2434	0.019
2009-01-08	216	220	200	215	2280	-0.005

	Open	High	Low	Close	Volume	Change
Date
2009-01-02	200	234	200	234	2122	0.083
2009-01-05	229	234	213	223	6890	-0.047
2009-01-06	210	234	210	212	2144	-0.049
2009-01-07	214	220	208	216	2434	0.019
2009-01-08	216	220	200	215	2280	-0.005

pd.concat([samsung_diff_col_df, kodex_df]).head()

	1_Open	1_High	1_Low	1_Close	1_Volume	1_Change	Open	High	Low	Close	Volume	Change
Date
2009-01-02	200.000	234.000	200.000	234.000	2122.000	0.083	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-05	229.000	234.000	213.000	223.000	6890.000	-0.047	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-06	210.000	234.000	210.000	212.000	2144.000	-0.049	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-07	214.000	220.000	208.000	216.000	2434.000	0.019	NaN	NaN	NaN	NaN	NaN	NaN
2009-01-08	216.000	220.000	200.000	215.000	2280.000	-0.005	NaN	NaN	NaN	NaN	NaN	NaN

Example

total_df = pd.concat([samsung_df, kodex_df], keys=['삼성', 'kodex200'], names=['종목명'])
total_df.head()
total_df.tail()

		Open	High	Low	Close	Volume	Change
종목명	Date
삼성	2009-01-02	200	234	200	234	2122	0.083
	2009-01-05	229	234	213	223	6890	-0.047
	2009-01-06	210	234	210	212	2144	-0.049
	2009-01-07	214	220	208	216	2434	0.019
	2009-01-08	216	220	200	215	2280	-0.005

		Open	High	Low	Close	Volume	Change
종목명	Date
kodex200	2017-12-21	29947	30002	29436	29442	9299734	-0.019
	2017-12-22	29518	29646	29444	29584	9212290	0.005
	2017-12-26	29643	29821	29458	29464	8257476	-0.004
	2017-12-27	29559	29803	29499	29810	13743517	0.012
	2017-12-28	29817	30210	29812	30184	8211476	0.013

total_df = total_df.reset_index()
total_df.head()

	종목명	Date	Open	High	Low	Close	Volume	Change
0	삼성	2009-01-02	200	234	200	234	2122	0.083
1	삼성	2009-01-05	229	234	213	223	6890	-0.047
2	삼성	2009-01-06	210	234	210	212	2144	-0.049
3	삼성	2009-01-07	214	220	208	216	2434	0.019
4	삼성	2009-01-08	216	220	200	215	2280	-0.005

total_df.pivot(index='Date', columns='종목명', values='Close')

종목명	kodex200	삼성
Date
2009-01-02	NaN	234.000
2009-01-05	NaN	223.000
2009-01-06	NaN	212.000
2009-01-07	NaN	216.000
2009-01-08	NaN	215.000
...	...	...
2017-12-21	29442.000	1050.000
2017-12-22	29584.000	1050.000
2017-12-26	29464.000	1060.000
2017-12-27	29810.000	1095.000
2017-12-28	30184.000	1065.000

2229 rows × 2 columns

pivot()

sample_data = pd.DataFrame(
    {
        "종목명":["삼성", "현대", "하이닉스", "삼성", "현대", "하이닉스"],
        "datetime":["2019-01-01", "2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02", "2019-01-02"],
        "price":[1,2,3, 4,5,6]
    }
)
sample_data

	종목명	datetime	price
0	삼성	2019-01-01	1
1	현대	2019-01-01	2
2	하이닉스	2019-01-01	3
3	삼성	2019-01-02	4
4	현대	2019-01-02	5
5	하이닉스	2019-01-02	6

sample_data.sort_values("종목명")

	종목명	datetime	price
0	삼성	2019-01-01	1
3	삼성	2019-01-02	4
2	하이닉스	2019-01-01	3
5	하이닉스	2019-01-02	6
1	현대	2019-01-01	2
4	현대	2019-01-02	5

sample_data.pivot(index="datetime", columns="종목명", values="price")

종목명	삼성	하이닉스	현대
datetime
2019-01-01	1	3	2
2019-01-02	4	6	5

join()

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 
left

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])
right

	A	B
K0	A0	B0
K1	A1	B1
K2	A2	B2

	C	D
K0	C0	D0
K2	C2	D2
K3	C3	D3

left.join(right)

	A	B	C	D
K0	A0	B0	C0	D0
K1	A1	B1	NaN	NaN
K2	A2	B2	C2	D2

left.join(right, how='outer')

	A	B	C	D
K0	A0	B0	C0	D0
K1	A1	B1	NaN	NaN
K2	A2	B2	C2	D2
K3	NaN	NaN	C3	D3

Example

left = pd.DataFrame(
    {
        'A':['A0', 'A1', 'A2', 'A3'],
        'B':['B0', 'B1', 'B2', 'B3'],
        'key':['K0', 'K1', 'K0', 'K1'],
    }
)
right = pd.DataFrame(
    {
        'C':['C0', 'C1'],
        'D':['D0', 'D1'],
    },
    index=['K0', 'K1']
)
left
right

	A	B	key
0	A0	B0	K0
1	A1	B1	K1
2	A2	B2	K0
3	A3	B3	K1

	C	D
K0	C0	D0
K1	C1	D1

left.join(right, on='key').set_index("key")

	A	B	C	D
key
K0	A0	B0	C0	D0
K1	A1	B1	C1	D1
K0	A2	B2	C0	D0
K1	A3	B3	C1	D1

left.set_index('key').join(right)

	A	B	C	D
K0	A0	B0	C0	D0
K0	A2	B2	C0	D0
K1	A1	B1	C1	D1
K1	A3	B3	C1	D1

a = pd.DataFrame([1,2,3], index=['a','b','c'], columns=['안녕'])
b = pd.DataFrame([4,2,6], index=['a','c','d'], columns=['안녕'])
a
b

	안녕
a	1
b	2
c	3

	안녕
a	4
c	2
d	6

a.join(b, lsuffix="_x", rsuffix="_y", how="inner")

	안녕_x	안녕_y
a	1	4
c	3	2

a.join(b, lsuffix="_x", rsuffix="_y")

	안녕_x	안녕_y
a	1	4.000
b	2	NaN
c	3	2.000

a1_df = pd.read_csv("data/Small_and_Big.csv")
a_df = pd.read_csv("data/Small_and_Big.csv", index_col=[0])
a1_df.head()
a_df.head()

	Unnamed: 0	date	종목명	PBR(IFRS-연결)	베타 (M,5Yr)	수익률(%)	시가총액 (보통)(평균)(원)
0	0	2000-07-31	BYC	0.210	0.479	-0.580	27786000000.000
1	1	2000-07-31	CJ	0.510	1.166	-9.000	1160889000000.000
2	2	2000-07-31	CJ ENM	6.560	NaN	17.400	400467000000.000
3	3	2000-07-31	CJ대한통운	0.170	1.314	-7.960	194962000000.000
4	4	2000-07-31	CJ씨푸드	NaN	0.227	32.000	1987000000.000

	date	종목명	PBR(IFRS-연결)	베타 (M,5Yr)	수익률(%)	시가총액 (보통)(평균)(원)
0	2000-07-31	BYC	0.210	0.479	-0.580	27786000000.000
1	2000-07-31	CJ	0.510	1.166	-9.000	1160889000000.000
2	2000-07-31	CJ ENM	6.560	NaN	17.400	400467000000.000
3	2000-07-31	CJ대한통운	0.170	1.314	-7.960	194962000000.000
4	2000-07-31	CJ씨푸드	NaN	0.227	32.000	1987000000.000

median_df = a_df.groupby(['date']).agg({'시가총액 (보통)(평균)(원)': 'median'})
median_df.columns = ['시가총액_median']
median_df.head()

	시가총액_median
date
2000-07-31	34947000000.000
2000-08-31	33684000000.000
2000-09-30	33684000000.000
2000-10-31	30523000000.000
2000-11-30	30798000000.000

joined_df = a_df.join(median_df, on="date")
joined_df.head()

	date	종목명	PBR(IFRS-연결)	베타 (M,5Yr)	수익률(%)	시가총액 (보통)(평균)(원)	시가총액_median
0	2000-07-31	BYC	0.210	0.479	-0.580	27786000000.000	34947000000.000
1	2000-07-31	CJ	0.510	1.166	-9.000	1160889000000.000	34947000000.000
2	2000-07-31	CJ ENM	6.560	NaN	17.400	400467000000.000	34947000000.000
3	2000-07-31	CJ대한통운	0.170	1.314	-7.960	194962000000.000	34947000000.000
4	2000-07-31	CJ씨푸드	NaN	0.227	32.000	1987000000.000	34947000000.000

joined_df[joined_df['date'] == "2000-08-31"].head()

	date	종목명	PBR(IFRS-연결)	베타 (M,5Yr)	수익률(%)	시가총액 (보통)(평균)(원)	시가총액_median
573	2000-08-31	BYC	0.220	0.477	3.500	27221000000.000	33684000000.000
574	2000-08-31	CJ	0.510	1.165	0.000	1052967000000.000	33684000000.000
575	2000-08-31	CJ ENM	6.730	NaN	2.680	373822000000.000	33684000000.000
576	2000-08-31	CJ대한통운	0.190	1.312	7.040	189622000000.000	33684000000.000
577	2000-08-31	CJ씨푸드	NaN	0.222	62.880	2765000000.000	33684000000.000

# Hint: 아래와 같은 느낌으로 하시면 됩니다.
# cond1 = joined_df['시가총액(보통~~)'] <  joined_df['시가총액_median']
# joined_df.loc[cond1, "small_or_big"] = "small"
# joined_df.loc[~cond1, "small_or_big"] = "big"

Merge()

left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})

left
right

	key1	key2	A	B
0	K0	K0	A0	B0
1	K0	K1	A1	B1
2	K1	K0	A2	B2
3	K2	K1	A3	B3

	key1	key2	C	D
0	K0	K0	C0	D0
1	K1	K0	C1	D1
2	K1	K0	C2	D2
3	K2	K0	C3	D3

# default: inner join(교집합)
pd.merge(left, right, on=['key1', 'key2'])

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2

# outer join(합집합)
pd.merge(left, right, how='outer', on=['key1', 'key2'])

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN
5	K2	K0	NaN	NaN	C3	D3

pd.merge(left, right, how='right', on=['key1', 'key2'])
pd.merge(left, right, how='left', on=['key1', 'key2'])

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K1	K0	A2	B2	C1	D1
2	K1	K0	A2	B2	C2	D2
3	K2	K0	NaN	NaN	C3	D3

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN

left = pd.DataFrame({'A':[1,2,], 'B':[2,2]})
right = pd.DataFrame({'A':[4,5,6], 'B':[2,2,2]})
left
right

	A	B
0	1	2
1	2	2

	A	B
0	4	2
1	5	2
2	6	2

# left, right, inner, outer 결과가 다 같음
pd.merge(left, right, on="B", how='left')

	A_x	B	A_y
0	1	2	4
1	1	2	5
2	1	2	6
3	2	2	4
4	2	2	5
5	2	2	6

Example

close_df = samsung_df['Close'].reset_index()
vol_df = samsung_df['Volume'].reset_index()
close_df.head()
vol_df.head()

	Date	Close
0	2009-01-02	234
1	2009-01-05	223
2	2009-01-06	212
3	2009-01-07	216
4	2009-01-08	215

	Date	Volume
0	2009-01-02	2122
1	2009-01-05	6890
2	2009-01-06	2144
3	2009-01-07	2434
4	2009-01-08	2280

vol_df.iloc[:2]

	Date	Volume
0	2009-01-02	2122
1	2009-01-05	6890

# default is 'inner' join
pd.merge(close_df, vol_df.iloc[:2])   # 알아서 같은 column 이름끼리 맞춤

	Date	Close	Volume
0	2009-01-02	234	2122
1	2009-01-05	223	6890

# 'outer' join
pd.merge(close_df, vol_df.iloc[:2], how="outer").head(5)

	Date	Close	Volume
0	2009-01-02	234	2122.000
1	2009-01-05	223	6890.000
2	2009-01-06	212	NaN
3	2009-01-07	216	NaN
4	2009-01-08	215	NaN

join, merge 어디에 써야하나?

join - index가 하나라도 관여
merge - column에 맞춰야 한다면
- merge를 사용할때 left_index, right_index를 사용하면 join과 같은 결과
- join 사용시 reset_index 사용시 merge와 같은결과

a = pd.DataFrame([1,2,3], index=['a','b','c'], columns=['안녕'])
b = pd.DataFrame([4,2,6], index=['a','c','d'], columns=['안녕'])
a
b

	안녕
a	1
b	2
c	3

	안녕
a	4
c	2
d	6

a.merge(b)
a.join(b, lsuffix="_x", rsuffix="_y", how="inner")

	안녕
0	2

	안녕_x	안녕_y
a	1	4
c	3	2

a.reset_index()
b.reset_index()
a.reset_index().merge(b.reset_index())

	index	안녕
0	a	1
1	b	2
2	c	3

	index	안녕
0	a	4
1	c	2
2	d	6

	index	안녕

a.merge(b, left_index=True, right_index=True)

	안녕_x	안녕_y
a	1	4
c	3	2

concat과 join, merge

a = pd.DataFrame({"a": [1,2,3],}, index=[1,2,3])
b = pd.DataFrame({"b": [1,4,5],}, index=[1,4,5])
a
b

	a
1	1
2	2
3	3

	b
1	1
4	4
5	5

pd.concat([a, b], axis=1)

	a	b
1	1.000	1.000
2	2.000	NaN
3	3.000	NaN
4	NaN	4.000
5	NaN	5.000

a = pd.DataFrame({"a": [1,2,3],}, index=[1,2,2])
b = pd.DataFrame({"b": [1,4,5],}, index=[1,4,5])
a
b

	a
1	1
2	2
2	3

	b
1	1
4	4
5	5

# pd.concat([a, b], axis=1)
# error 발생! => concat()을 cartesian product가 불가능하기 때문에, 중복 index 혹은 column이 있는 경우 작동하지 못함

Example

product_df = pd.read_csv("data/product.csv", index_col=0)
review_df = pd.read_csv("data/review.csv", index_col=0)
product_df.head(2)
review_df.head(2)

	id	brand	title	price
0	384	apple	apple iphone 6 (space grey, 32 gb)	23999.000
1	385	apple	apple iphone 6s (space grey, 32 gb)	33999.000

	product__id	author	rating	title	content	date
0	351	Subhankar Mishra	5.000	AnyOne has ordered yet ?	Has anyone ordered yet ? Please tell us your e...	2011-07-22
1	351	Chinmay Chopade	4.000	It does what it says !!!	I bought this phone and been using for last we...	2011-08-23

flipkart_df = pd.merge(
    product_df, 
    review_df, 

    left_on="id",
    right_on='product__id',
    
    how='right',    # Review에 있는 id, 즉 product__id를 기준으로 데이터를 생성합니다. 만약 "product" 정보가 반드시 존재하는 review들로만 데이터를 구성하고 싶으면 "left"로 하시면 됩니다.
)  
flipkart_df.head(2)

	id	brand	title_x	price	product__id	author	rating	title_y	content	date
0	351	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	351	Subhankar Mishra	5.000	AnyOne has ordered yet ?	Has anyone ordered yet ? Please tell us your e...	2011-07-22
1	351	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	351	Chinmay Chopade	4.000	It does what it says !!!	I bought this phone and been using for last we...	2011-08-23

# column을 제거 하기 위해서는 drop([컬럼1, 컬럼2, ..], axis=1)과 같은 방식으로 진행합니다
flipkart_df = flipkart_df.drop(['id', 'product__id', 'author'], axis=1)
flipkart_df.head(2)

	brand	title_x	price	rating	title_y	content	date
0	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	5.000	AnyOne has ordered yet ?	Has anyone ordered yet ? Please tell us your e...	2011-07-22
1	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	4.000	It does what it says !!!	I bought this phone and been using for last we...	2011-08-23

Example-2

amazon_df = pd.read_csv("data/amazon_review1.csv", index_col=0)
amazon_df.head(2)
amazon_df.shape
flipkart_df.shape

	brand	title_x	price	title_y	content	rating
date
2013-06-25	apple	apple iphone 5 (black-slate, 16gb)	14499.000	well an iphone is an iphone!	Piece of art you like to keep holding onto, th...	5
2013-06-25	samsung	samsung galaxy s4 gt-i9500 (white frost, 16gb)	19439.000	really a life companion device	I got my S4 few days back n this device is awe...	5

(42190, 6)

(27008, 7)

df = pd.concat([amazon_df, flipkart_df], axis=0)   # `keys` argument를 활용하셔서 site를 구분하셔도 됩니다
df.shape

(69198, 7)

df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)

df.set_index('date', inplace=True)
df = df.loc[:"2017-12-31"]   # 비록 DatetimeIndex이지만, 날짜를 문자열 string으로 표현하여 loc을 이용한 range indexing이 가능합니다.

df.rename(columns={'title_x':'name', 'title_y':'title'}, inplace=True)

df['price_grp'] = pd.cut(df['price'], [0, 5000, 15000, 200000], labels=["저가", "중가", "고가"])

df.head()

	brand	name	price	title	content	rating	price_grp
date
2011-07-22	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	AnyOne has ordered yet ?	Has anyone ordered yet ? Please tell us your e...	5.000	중가
2011-08-23	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	It does what it says !!!	I bought this phone and been using for last we...	4.000	중가
2011-08-23	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	Value for money	Excellent phone altogether. Only negative poin...	5.000	중가
2011-08-23	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	Good for Price and delivery speed....	Worth for the money..... How much you are payi...	5.000	중가
2011-09-15	micromax	micromax a70 (cocoa brown, 160 mb)	7005.000	FLIPKART ROCKS	i am a regular user of ebay india, and i usual...	5.000	중가

Example-3

# 아래에서 보이시는 것처럼 groupby()의 인자에는 반드시 'column명'이 아니어도 됩니다.
# 아래 예제처럼 df라는 object로부터 추출할 수 있는(여기서는 df.index에서 추출한) categorical 변수들을 사용해도 됩니다.
df.groupby([df.index.year, df.index.quarter]).agg({'price':'mean'})

		price
date	date
2011	3	7005.000
2011	4	6564.214
2012	1	5837.250
	2	4456.945
	3	5203.164
	4	8851.987
2013	1	8228.694
	2	13098.951
	3	10574.770
	4	10106.358
2014	1	8955.434
	2	8895.550
	3	8969.058
	4	12091.026
2015	1	10409.730
	2	17200.320
	3	16498.556
	4	18839.613
2016	1	21684.773
	2	19896.658
	3	17594.872
	4	23765.043
2017	1	17576.598
	2	19595.137
	3	17528.829
	4	27268.716

df.groupby([df.index.year, df.index.quarter]).agg({'price':'mean'}).plot(kind='bar')

'''
ax = df.resample("Q")['price'].mean().plot();

ax.set_title("기간별 제품 평균가격");
ax.set_xlabel("기 간");
ax.set_ylabel("가 격");
'''

Example-4

df.groupby(['brand']).agg({'name':'count'})

	name
brand
apple	5399
mi	1558
micromax	8078
samsung	7284

df.groupby(['brand']).agg({'name':'count'}).plot(kind='bar', figsize=(8,5));

png

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN
5	K2	K0	NaN	NaN	C3	D3

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN
5	K2	K0	NaN	NaN	C3	D3

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN
5	K2	K0	NaN	NaN	C3	D3

	key1	key2	A	B	C	D
0	K0	K0	A0	B0	C0	D0
1	K0	K1	A1	B1	NaN	NaN
2	K1	K0	A2	B2	C1	D1
3	K1	K0	A2	B2	C2	D2
4	K2	K1	A3	B3	NaN	NaN