Provides two new data types for storing data:
series - each column of a datafram is a series.
dataframe - made up of several sereies.
Pandas also has SQL-like functions for merging, joining, and sorting dataframes.
import pandas as pd
import numpy as np
mylist = [5.4,6.1,1.7,99.8]
myarray = np.array(mylist)
myseries1 = pd.Series(data=mylist)
print(myseries1)
myseries2= pd.Series(data=myarray)
print(myseries2)
0 5.4 1 6.1 2 1.7 3 99.8 dtype: float64 0 5.4 1 6.1 2 1.7 3 99.8 dtype: float64
NOTE: float64 means number with 64 bits of precision.
print(myseries1[2]) # like arrays can access individual entries
1.7
mylabels=['first','second','third','fourth']
myseries3 = pd.Series(data=mylist,index=mylabels) #add labels to entries in a series
print(myseries3)
first 5.4 second 6.1 third 1.7 fourth 99.8 dtype: float64
myseries4 = pd.Series(mylist,mylabels) #NOTE: you do not have to be explicit about the entries of pd.Series
print(myseries4)
first 5.4 second 6.1 third 1.7 fourth 99.8 dtype: float64
print(myseries4['second']) #Access entries using the index labels
6.1
# math on series
myseries5 = pd.Series([5.5,1.1,8.8,1.6],['first','third','fourth','fifth'])
print(myseries4)
print('+')
print(myseries5)
print('__________________')
print(myseries5+myseries4)
first 5.4 second 6.1 third 1.7 fourth 99.8 dtype: float64 + first 5.5 third 1.1 fourth 8.8 fifth 1.6 dtype: float64 __________________ fifth NaN first 10.9 fourth 108.6 second NaN third 2.8 dtype: float64
df1 = pd.concat([myseries4,myseries5],axis=1,sort=False)
#axis=1 says make multiple columns axis=0 rows
#sort=Alphabetical sort?
df1
0 | 1 | |
---|---|---|
first | 5.4 | 5.5 |
second | 6.1 | NaN |
third | 1.7 | 1.1 |
fourth | 99.8 | 8.8 |
fifth | NaN | 1.6 |
df2 = pd.DataFrame(np.random.randn(5,5)) # Create new dataframe
df2
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | -0.950540 | 1.590717 | -0.447912 | 0.244753 | 0.258761 |
1 | 0.711849 | -1.604905 | 0.006943 | 0.867151 | 1.645310 |
2 | 1.830404 | -0.054315 | 1.313139 | 0.768555 | 1.310385 |
3 | -1.132748 | -0.333121 | 1.040200 | -0.003932 | 0.712933 |
4 | -1.235484 | -1.019824 | -0.172312 | 0.665532 | -1.287178 |
df3 = pd.DataFrame(np.random.randn(5,5),index=['first row','second row','third row','fourth row','fifth row'],
columns=['first col','second col','third col','fourth col','fifth col'])
df3
first col | second col | third col | fourth col | fifth col | |
---|---|---|---|---|---|
first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 |
second row | -0.780710 | 0.195434 | -1.698385 | 0.782769 | 1.194898 |
third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 |
fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 |
fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 |
# Access individual series in a data frame
print(df3['second col']) # NOTE can only acces column like this. Not row.
print('')
df3[['third col','first col']] # two columns together are a dataframe
first row -0.200170 second row 0.195434 third row 0.002575 fourth row -0.593658 fifth row 0.050926 Name: second col, dtype: float64
third col | first col | |
---|---|---|
first row | 1.243514 | -1.586390 |
second row | -1.698385 | -0.780710 |
third row | -0.793467 | 0.687090 |
fourth row | 0.535864 | -0.009804 |
fifth row | -1.529726 | -2.271348 |
# Access rows of a dataframe
df3.loc['fourth row']
first col -0.009804 second col -0.593658 third col 0.535864 fourth col -1.080070 fifth col -0.648657 Name: fourth row, dtype: float64
df3.iloc[2] # Access via row index number
first col 0.687090 second col 0.002575 third col -0.793467 fourth col -0.518523 fifth col 0.029827 Name: third row, dtype: float64
df3.loc[['fourth row','first row'],['second col','third col']]
second col | third col | |
---|---|---|
fourth row | -0.593658 | 0.535864 |
first row | -0.200170 | 1.243514 |
# logical indexing for dataframes
df3>0
first col | second col | third col | fourth col | fifth col | |
---|---|---|---|---|---|
first row | False | False | True | True | False |
second row | False | True | False | True | True |
third row | True | True | False | False | True |
fourth row | False | False | True | False | False |
fifth row | False | True | False | False | True |
print(df3[df3>0])
first col second col third col fourth col fifth col first row NaN NaN 1.243514 0.827881 NaN second row NaN 0.195434 NaN 0.782769 1.194898 third row 0.68709 0.002575 NaN NaN 0.029827 fourth row NaN NaN 0.535864 NaN NaN fifth row NaN 0.050926 NaN NaN 1.402452
axis=0 is row axis=1 is a column
# Add columsn to a dataframe
df3['sixth col'] = np.random.randn(5,1)
df3
first col | second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|---|
first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
second row | -0.780710 | 0.195434 | -1.698385 | 0.782769 | 1.194898 | 0.951449 |
third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
df3.drop('first col',axis=1)# No showing first col but still there.
second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|
first row | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
second row | 0.195434 | -1.698385 | 0.782769 | 1.194898 | 0.951449 |
third row | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
fourth row | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
fifth row | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
# Proof
df3
first col | second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|---|
first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
second row | -0.780710 | 0.195434 | -1.698385 | 0.782769 | 1.194898 | 0.951449 |
third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
df4 = df3.drop('first col',axis=1)
df4
second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|
first row | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
second row | 0.195434 | -1.698385 | 0.782769 | 1.194898 | 0.951449 |
third row | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
fourth row | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
fifth row | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
df5 = df3.drop('second row', axis=0)
df5
first col | second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|---|
first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
reset_index
df5.reset_index() # Removes Index Labels and creats a new column with values from the previous Index. Now a series.
index | first col | second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|---|---|
0 | first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
1 | third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
2 | fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
3 | fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
df5 # still what it was before
first col | second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|---|
first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
df5.reset_index(inplace=True)
df5
index | first col | second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|---|---|
0 | first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
1 | third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
2 | fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
3 | fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
# Assign new names to the index
df5['new name'] = ['This','is','the','row']
df5
index | first col | second col | third col | fourth col | fifth col | sixth col | new name | |
---|---|---|---|---|---|---|---|---|
0 | first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 | This |
1 | third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 | is |
2 | fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 | the |
3 | fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 | row |
df5.set_index('new name',inplace=True)
df5
index | first col | second col | third col | fourth col | fifth col | sixth col | |
---|---|---|---|---|---|---|---|
new name | |||||||
This | first row | -1.586390 | -0.200170 | 1.243514 | 0.827881 | -1.850758 | 0.840793 |
is | third row | 0.687090 | 0.002575 | -0.793467 | -0.518523 | 0.029827 | 1.069176 |
the | fourth row | -0.009804 | -0.593658 | 0.535864 | -1.080070 | -0.648657 | -1.014823 |
row | fifth row | -2.271348 | 0.050926 | -1.529726 | -0.041977 | 1.402452 | 0.213570 |
df5.mean()
first col -0.795113 second col -0.185082 third col -0.135954 fourth col -0.203172 fifth col -0.266784 sixth col 0.277179 dtype: float64
#if some series has multiple of the same value then we can group all the unique entries together
mydict = ({'customer': ['Customer 1','Customer 1','Customer 2', 'Customer 2','Customer 3','Customer 3'],
'product1': [1.1,2.1,3.8,4.2,5.5,6.9],
'product2': [8.2,9.1,11.1,5.2,44.66,983]})
df6 = pd.DataFrame(mydict,index=['Purchase 1','Purchase 2','Purchase 3','Purchase 4', 'Purchase 5','Purchase 6'])
df6
customer | product1 | product2 | |
---|---|---|---|
Purchase 1 | Customer 1 | 1.1 | 8.20 |
Purchase 2 | Customer 1 | 2.1 | 9.10 |
Purchase 3 | Customer 2 | 3.8 | 11.10 |
Purchase 4 | Customer 2 | 4.2 | 5.20 |
Purchase 5 | Customer 3 | 5.5 | 44.66 |
Purchase 6 | Customer 3 | 6.9 | 983.00 |
Ways to combine dataframes are similar to SQL.
Three methods:
1. concat
2. join
3. merge
df7 = pd.DataFrame({"customer":['101','102','103','104'],
'category':['cat2','cat2','cat1','cat3'],
'important': ['yes','no','yes','yes'],
'sales': [123,52,214,663]},index=[0,1,2,3])
df8 = pd.DataFrame({"customer":['101','103','104','105'],
'color': ['yellow','green','green','blue'],
'distance':[12,9,44,21],
'sales':[123,214,663,331]},index=[4,5,6,7]) #change index values
pd.concat([df7,df8],axis=0,sort=False) # Concat by Row. Stack on top of each other.
customer | category | important | sales | color | distance | |
---|---|---|---|---|---|---|
0 | 101 | cat2 | yes | 123 | NaN | NaN |
1 | 102 | cat2 | no | 52 | NaN | NaN |
2 | 103 | cat1 | yes | 214 | NaN | NaN |
3 | 104 | cat3 | yes | 663 | NaN | NaN |
4 | 101 | NaN | NaN | 123 | yellow | 12.0 |
5 | 103 | NaN | NaN | 214 | green | 9.0 |
6 | 104 | NaN | NaN | 663 | green | 44.0 |
7 | 105 | NaN | NaN | 331 | blue | 21.0 |
pd.concat([df7,df8],axis=0,sort=True) # concats the columns if they are the same
category | color | customer | distance | important | sales | |
---|---|---|---|---|---|---|
0 | cat2 | NaN | 101 | NaN | yes | 123 |
1 | cat2 | NaN | 102 | NaN | no | 52 |
2 | cat1 | NaN | 103 | NaN | yes | 214 |
3 | cat3 | NaN | 104 | NaN | yes | 663 |
4 | NaN | yellow | 101 | 12.0 | NaN | 123 |
5 | NaN | green | 103 | 9.0 | NaN | 214 |
6 | NaN | green | 104 | 44.0 | NaN | 663 |
7 | NaN | blue | 105 | 21.0 | NaN | 331 |
pd.concat([df7,df8],axis=1,sort=False) # done by index. Notice the sqare pattern if index is not the same.
customer | category | important | sales | customer | color | distance | sales | |
---|---|---|---|---|---|---|---|---|
0 | 101 | cat2 | yes | 123.0 | NaN | NaN | NaN | NaN |
1 | 102 | cat2 | no | 52.0 | NaN | NaN | NaN | NaN |
2 | 103 | cat1 | yes | 214.0 | NaN | NaN | NaN | NaN |
3 | 104 | cat3 | yes | 663.0 | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | 101 | yellow | 12.0 | 123.0 |
5 | NaN | NaN | NaN | NaN | 103 | green | 9.0 | 214.0 |
6 | NaN | NaN | NaN | NaN | 104 | green | 44.0 | 663.0 |
7 | NaN | NaN | NaN | NaN | 105 | blue | 21.0 | 331.0 |
Merge - combines dataframes using a column's values to identify common entries on = which column are we going to try and find common entries. Must be common betweenthe dataframes. how = which rows do we want to combine. (outer or inner)
outer = union of all similar rows.
inner = intersection of rows between data frames.
pd.merge(df7,df8,how='outer',on='customer') #outer merge is union of on
# Note sales_x and sales_y if they have the same name. Better than what concat was doing
customer | category | important | sales_x | color | distance | sales_y | |
---|---|---|---|---|---|---|---|
0 | 101 | cat2 | yes | 123.0 | yellow | 12.0 | 123.0 |
1 | 102 | cat2 | no | 52.0 | NaN | NaN | NaN |
2 | 103 | cat1 | yes | 214.0 | green | 9.0 | 214.0 |
3 | 104 | cat3 | yes | 663.0 | green | 44.0 | 663.0 |
4 | 105 | NaN | NaN | NaN | blue | 21.0 | 331.0 |
pd.merge(df7,df8,how='inner',on='customer') # inner merge is intersection of on
customer | category | important | sales_x | color | distance | sales_y | |
---|---|---|---|---|---|---|---|
0 | 101 | cat2 | yes | 123 | yellow | 12 | 123 |
1 | 103 | cat1 | yes | 214 | green | 9 | 214 |
2 | 104 | cat3 | yes | 663 | green | 44 | 663 |
pd.merge(df7,df8,how='left',on='customer') # left merge is just first on, but all columns ... right is second
customer | category | important | sales_x | color | distance | sales_y | |
---|---|---|---|---|---|---|---|
0 | 101 | cat2 | yes | 123 | yellow | 12.0 | 123.0 |
1 | 102 | cat2 | no | 52 | NaN | NaN | NaN |
2 | 103 | cat1 | yes | 214 | green | 9.0 | 214.0 |
3 | 104 | cat3 | yes | 663 | green | 44.0 | 663.0 |
pd.merge(df7,df8,how='right',on='customer') # right merge
customer | category | important | sales_x | color | distance | sales_y | |
---|---|---|---|---|---|---|---|
0 | 101 | cat2 | yes | 123.0 | yellow | 12 | 123 |
1 | 103 | cat1 | yes | 214.0 | green | 9 | 214 |
2 | 104 | cat3 | yes | 663.0 | green | 44 | 663 |
3 | 105 | NaN | NaN | NaN | blue | 21 | 331 |
Join - combines dataframes using the index to identify common entries. similar to merge but using index labels instead of using vales from one of the columns.
NOTE: Syntax is a bit different from merge as well.
df9 = pd.DataFrame({'Q1': [101,103,103],
'Q2': [201,202,203]},
index=['I0','I1','I2'])
df10 = pd.DataFrame({'Q3': [301,302,303],
'Q4': [401,402,403]},
index=['I0','I2','I3'])
df9.join(df10,how='outer') # outer, inner, left, and right work the same as merge.
Q1 | Q2 | Q3 | Q4 | |
---|---|---|---|---|
I0 | 101.0 | 201.0 | 301.0 | 401.0 |
I1 | 103.0 | 202.0 | NaN | NaN |
I2 | 103.0 | 203.0 | 302.0 | 402.0 |
I3 | NaN | NaN | 303.0 | 403.0 |
df8['color'].unique() # Find unique color values within a series.
array(['yellow', 'green', 'blue'], dtype=object)
df8['color'].value_counts() #How many times each index label shows up in the series.
green 2 yellow 1 blue 1 Name: color, dtype: int64
df9.mean() # Calculate the mean colum by column function.
Q1 102.333333 Q2 202.000000 dtype: float64
df8.columns # Display Index Labels for Data Frame
Index(['customer', 'color', 'distance', 'sales'], dtype='object')
df8
customer | color | distance | sales | |
---|---|---|---|---|
4 | 101 | yellow | 12 | 123 |
5 | 103 | green | 9 | 214 |
6 | 104 | green | 44 | 663 |
7 | 105 | blue | 21 | 331 |
new_df = df8[(df8['customer']!='105') & (df8['color']!='green')]
new_df
customer | color | distance | sales | |
---|---|---|---|---|
4 | 101 | yellow | 12 | 123 |
print(df8['sales'].mean())
print(df8['distance'].min())
print(df8['sales'].std())
print(df8['sales'].sum())
332.75 9 236.05419575456256 1331
def profit(s):
return s*0.5 # 50% markup..
df8['sales'].apply(profit)
4 61.5 5 107.0 6 331.5 7 165.5 Name: sales, dtype: float64
df8['color'].apply(len) # len is the length of a string. Standard packge funciton.
4 6 5 5 6 5 7 4 Name: color, dtype: int64
df11 = df8[['distance','sales']]
df11.applymap(profit) #applymap only works on dataframes and it works entry by entry. (vs and entire column)
distance | sales | |
---|---|---|
4 | 6.0 | 61.5 |
5 | 4.5 | 107.0 |
6 | 22.0 | 331.5 |
7 | 10.5 | 165.5 |
def col_sum(co):
return sum(co) #Sums Column series
df11.apply(col_sum)
distance 86 sales 1331 dtype: int64
del df8['color'] #Delete column from within a dataframe
df8
customer | distance | sales | |
---|---|---|---|
4 | 101 | 12 | 123 |
5 | 103 | 9 | 214 |
6 | 104 | 44 | 663 |
7 | 105 | 21 | 331 |
df8.index
Int64Index([4, 5, 6, 7], dtype='int64')
df8.sort_values(by='distance') # if you use ,inplace=True) stays taht way.
customer | distance | sales | |
---|---|---|---|
5 | 103 | 9 | 214 |
4 | 101 | 12 | 123 |
7 | 105 | 21 | 331 |
6 | 104 | 44 | 663 |
mydict = {'customer': ['Customer 1','Customer 1','Customer2','Customer2','Customer3','Customer3'],
'product1':[1.1,2.1,3.8,4.2,5.5,6.9],
'product2':[8.2,9.1,11.1,5.2,44.66,983]}
df6 = pd.DataFrame(mydict,index=['Prucahase 1','Purchase 2','Purchase 3','Purchase 4','Purchase 5','Purchase 6'])
df6
customer | product1 | product2 | |
---|---|---|---|
Prucahase 1 | Customer 1 | 1.1 | 8.20 |
Purchase 2 | Customer 1 | 2.1 | 9.10 |
Purchase 3 | Customer2 | 3.8 | 11.10 |
Purchase 4 | Customer2 | 4.2 | 5.20 |
Purchase 5 | Customer3 | 5.5 | 44.66 |
Purchase 6 | Customer3 | 6.9 | 983.00 |
grouped_data = df6.groupby('customer') # like a pivot table by grouping by customer.
print(grouped_data)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb42844a0d0>
grouped_data.describe()
product1 | product2 | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
customer | ||||||||||||||||
Customer 1 | 2.0 | 1.6 | 0.707107 | 1.1 | 1.35 | 1.6 | 1.85 | 2.1 | 2.0 | 8.65 | 0.636396 | 8.20 | 8.425 | 8.65 | 8.875 | 9.1 |
Customer2 | 2.0 | 4.0 | 0.282843 | 3.8 | 3.90 | 4.0 | 4.10 | 4.2 | 2.0 | 8.15 | 4.171930 | 5.20 | 6.675 | 8.15 | 9.625 | 11.1 |
Customer3 | 2.0 | 6.2 | 0.989949 | 5.5 | 5.85 | 6.2 | 6.55 | 6.9 | 2.0 | 513.83 | 663.506577 | 44.66 | 279.245 | 513.83 | 748.415 | 983.0 |
df8.to_csv('df8.csv',index=True) # save with the index labels.
new_df8 = pd.read_csv('df8.csv',index_col=0) #index_col is the column to use for index labels.
#If you don't supply it creates a new index.
new_df8
customer | distance | sales | |
---|---|---|---|
4 | 101 | 12 | 123 |
5 | 103 | 9 | 214 |
6 | 104 | 44 | 663 |
7 | 105 | 21 | 331 |
df8.to_excel('df8.xlsx',index=False,sheet_name='first sheet')
newer_df8 = pd.read_excel('df8.xlsx',sheet_name='first sheet')
newer_df8
customer | distance | sales | |
---|---|---|---|
0 | 101 | 12 | 123 |
1 | 103 | 9 | 214 |
2 | 104 | 44 | 663 |
3 | 105 | 21 | 331 |