Pandas
import numpy as np
import pandas as pd
labels = ['a','b','c']
my_list = [10,20,30]
arr = np.array([10,20,30])
my_dict = {'a' : 10, 'b' : 20, 'c' : 30}
Series
pd.Series(data = my_list)
pd.Series(data = my_list, index = labels)
pd.Series(arr)
pd.Series(arr,labels)
pd.Series(my_dict)
ser_1 = pd.Series([1,2,3,4], index = ['A','B','C','D'])
ser_1
ser_2 = pd.Series([4,3,2,1],['B','C','D','F'])
ser_2
ser_1 + ser_2 # index가 하나만 있을 경우 NaN
DataFrame
Column Selection
df = pd.DataFrame(np.random.randn(5,4), index = 'A B C D E'.split(),
columns = 'W X Y Z'.split()) # use split() to make index or columns
df['W']
df[['W','Z']] # select multiple columns by list
df['New'] = df['W'] + df['Z']
df
Drop Column
df.drop('New', axis = 1) # axis = 1 : column
df.drop('E',axis = 0) # axis = 0 : row
df
df.drop('New', axis = 1, inplace = True) # [inplace = True] : apply to dataframe
df
Row Selection
df.loc['A']
df.iloc[0]
df.loc['B','Y']
df.loc[['A','B'],['X','Y']]
Conditional Selection
df > 0
df[df['X'] > 0]
df[df['X'] > 0][['Y','Z']]
df[(df['X'] > 0) & (df['Y'] < 0)]
df[(df['X'] > 0) | (df['Y'] < 0)]
Reset Index
df.reset_index() # reset index to default
Set Index
alphabet = 'AB CD EF GH IK'.split()
df['new'] = alphabet
df
df.set_index('new', inplace = True) # set new index with existing column
df
Multi-index
outside = 'G1 G1 G1 G2 G2 G2'.split()
inside = [1, 2, 3, 1, 2, 3]
hier_index = list(zip(outside, inside)) # making a list with (outside, inside) tuples
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
df = pd.DataFrame(np.random.randn(6,2), index = hier_index, columns = ['A', 'B'])
df
df.loc['G1']
df.loc['G1'].loc[1]
df.index.names = ['Group', 'Index'] # column name of multi-index
df
df.xs('G1')
df.xs(('G1',1))
df.xs(1, level = 'Index') # select multiple rows with different group index
Missing data
df = pd.DataFrame({'A' : [1, 2, np.nan],
'B' : [3, np.nan, np.nan],
'C' : [4, 5, 6]})
df.dropna(axis = 1) # drop column which has NaN value
df.dropna(thresh = 2) # accept less than 2 NaN value
df.fillna(value = 'FILL') # fill Nan value with value = 'FILL'
df['A'].fillna(value=df['A'].mean(),inplace = True) # fill Nan value with mean
df
Groupby
# 여러개의 row를 column 기준으로 묶기
df = pd.DataFrame({'Comp' : ['Google', 'Google', 'Microsoft', 'Microsoft', 'Apple', 'Apple'],
'Name' : ['Charlie', 'Sam', 'Jake', 'Paul', 'Ben', 'Kate'],
'Sales' : [100, 200, 300, 400, 500, 600]})
df
df.groupby('Comp')
df.groupby('Comp').mean() # compute mean grouped by Comp
df.groupby('Comp').std()
df.groupby('Comp').min()
df.groupby('Comp').max()
df.groupby('Comp').count()
df.groupby('Comp').describe() # show meaningful data of groups
df.groupby('Comp').describe()
df.groupby('Comp').describe().transpose()['Apple']
Concatenation
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
pd.concat([df1, df2, df3]) # row로 붙이기
pd.concat([df1, df2, df3], axis = 1) # column으로 붙이기
Merging
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
pd.merge(left, right, how = 'inner', on = 'key')
Joining
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
left.join(right)
left.join(right, how = 'outer')
Unique Value, Conditional Selection
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df['col2'].unique() # unique values
df['col2'].nunique() # number of unique values
df['col2'].value_counts() # unique value occurrence
df[(df['col1'] > 2) & (df['col2'] == 444)]
Applying Functions
def times2(x):
return x * 2
df['col1'].apply(times2)
df['col3'].apply(len)
df['col1'].sum()
del df['col1'] # permanently delete column
df.columns
df.index
df.sort_values(by='col2')
df.isnull()
Data Input / Output
df = pd.read_csv('file name') # input
pd.read_excel('file name.xlsx',sheetname='Sheet1')
df.to_csv('file name', index = False) # output
df.to_excel('file name.xlsx',sheet_name='Sheet1'
'Coding > Python' 카테고리의 다른 글
Seaborn 라이브러리 모음 (0) | 2022.02.23 |
---|---|
Matplotlib 라이브러리 모음 (0) | 2022.02.22 |
Numpy 라이브러리 모음 (0) | 2022.02.21 |
[코뮤니티] 클래스의 상속, 메소드 오버라이딩(Overriding) (0) | 2021.09.30 |
[코뮤니티] 클래스와 메소드, 생성자 (0) | 2021.09.29 |