import pandas as pd
import numpy as np


mylist = [5.4,6.1,1.7,99.8]
myarray = np.array(mylist)


myseries1 = pd.Series(data=mylist)
print(myseries1)
myseries2= pd.Series(data=myarray)
print(myseries2)

0     5.4
1     6.1
2     1.7
3    99.8
dtype: float64
0     5.4
1     6.1
2     1.7
3    99.8
dtype: float64


print(myseries1[2]) # like arrays can access individual entries

1.7


mylabels=['first','second','third','fourth']
myseries3 = pd.Series(data=mylist,index=mylabels) #add labels to entries in a series
print(myseries3)

first      5.4
second     6.1
third      1.7
fourth    99.8
dtype: float64


myseries4 = pd.Series(mylist,mylabels) #NOTE: you do not have to be explicit about the entries of pd.Series
print(myseries4)

first      5.4
second     6.1
third      1.7
fourth    99.8
dtype: float64


print(myseries4['second']) #Access entries using the index labels

6.1


# math on series
myseries5 = pd.Series([5.5,1.1,8.8,1.6],['first','third','fourth','fifth'])
print(myseries4)
print('+')
print(myseries5)
print('__________________')
print(myseries5+myseries4)

first      5.4
second     6.1
third      1.7
fourth    99.8
dtype: float64
+
first     5.5
third     1.1
fourth    8.8
fifth     1.6
dtype: float64
__________________
fifth       NaN
first      10.9
fourth    108.6
second      NaN
third       2.8
dtype: float64


df1 = pd.concat([myseries4,myseries5],axis=1,sort=False) 
#axis=1 says make multiple columns axis=0 rows
#sort=Alphabetical sort?
df1


df2 = pd.DataFrame(np.random.randn(5,5)) # Create new dataframe
df2


df3 = pd.DataFrame(np.random.randn(5,5),index=['first row','second row','third row','fourth row','fifth row'],
                   columns=['first col','second col','third col','fourth col','fifth col'])
df3


# Access individual series in a data frame
print(df3['second col']) # NOTE can only acces column like this. Not row.
print('')
df3[['third col','first col']] # two columns together are a dataframe

first row    -0.200170
second row    0.195434
third row     0.002575
fourth row   -0.593658
fifth row     0.050926
Name: second col, dtype: float64


# Access rows of a dataframe
df3.loc['fourth row']

first col    -0.009804
second col   -0.593658
third col     0.535864
fourth col   -1.080070
fifth col    -0.648657
Name: fourth row, dtype: float64


df3.iloc[2] # Access via row index number

first col     0.687090
second col    0.002575
third col    -0.793467
fourth col   -0.518523
fifth col     0.029827
Name: third row, dtype: float64


df3.loc[['fourth row','first row'],['second col','third col']]


# logical indexing for dataframes
df3>0


print(df3[df3>0])

            first col  second col  third col  fourth col  fifth col
first row         NaN         NaN   1.243514    0.827881        NaN
second row        NaN    0.195434        NaN    0.782769   1.194898
third row     0.68709    0.002575        NaN         NaN   0.029827
fourth row        NaN         NaN   0.535864         NaN        NaN
fifth row         NaN    0.050926        NaN         NaN   1.402452


# Add columsn to a dataframe
df3['sixth col'] = np.random.randn(5,1)
df3


df3.drop('first col',axis=1)# No showing first col but still there.


# Proof
df3


df4 = df3.drop('first col',axis=1)
df4


df5 = df3.drop('second row', axis=0)
df5


df5.reset_index() # Removes Index Labels and creats a new column with values from the previous Index. Now a series.


df5 # still what it was before


df5.reset_index(inplace=True)
df5


# Assign new names to the index
df5['new name'] = ['This','is','the','row']
df5


df5.set_index('new name',inplace=True)
df5


df5.mean()

first col    -0.795113
second col   -0.185082
third col    -0.135954
fourth col   -0.203172
fifth col    -0.266784
sixth col     0.277179
dtype: float64


#if some series has multiple of the same value then we can group all the unique entries together
mydict = ({'customer': ['Customer 1','Customer 1','Customer 2', 'Customer 2','Customer 3','Customer 3'], 
          'product1': [1.1,2.1,3.8,4.2,5.5,6.9],
          'product2': [8.2,9.1,11.1,5.2,44.66,983]})
df6 = pd.DataFrame(mydict,index=['Purchase 1','Purchase 2','Purchase 3','Purchase 4', 'Purchase 5','Purchase 6'])
df6


df7 = pd.DataFrame({"customer":['101','102','103','104'],
                    'category':['cat2','cat2','cat1','cat3'],
                   'important': ['yes','no','yes','yes'],
                   'sales': [123,52,214,663]},index=[0,1,2,3])

df8 = pd.DataFrame({"customer":['101','103','104','105'],
                   'color': ['yellow','green','green','blue'],
                             'distance':[12,9,44,21],
                             'sales':[123,214,663,331]},index=[4,5,6,7]) #change index values


pd.concat([df7,df8],axis=0,sort=False) # Concat by Row. Stack on top of each other.


pd.concat([df7,df8],axis=0,sort=True) # concats the columns if they are the same


pd.concat([df7,df8],axis=1,sort=False) # done by index. Notice the sqare pattern if index is not the same.


pd.merge(df7,df8,how='outer',on='customer') #outer merge is union of on
# Note sales_x and sales_y if they have the same name. Better than what concat was doing


pd.merge(df7,df8,how='inner',on='customer') # inner merge is intersection of on


pd.merge(df7,df8,how='left',on='customer') # left merge is just first on, but all columns ... right is second


pd.merge(df7,df8,how='right',on='customer') # right merge


df9 = pd.DataFrame({'Q1': [101,103,103],
                    'Q2': [201,202,203]},
                   index=['I0','I1','I2'])

df10 = pd.DataFrame({'Q3': [301,302,303],
                     'Q4': [401,402,403]},
                    index=['I0','I2','I3'])


df9.join(df10,how='outer') # outer, inner, left, and right work the same as merge.


df8['color'].unique() # Find unique color values within a series.

array(['yellow', 'green', 'blue'], dtype=object)


df8['color'].value_counts() #How many times each index label shows up in the series.

green     2
yellow    1
blue      1
Name: color, dtype: int64


df9.mean() # Calculate the mean colum by column function.

Q1    102.333333
Q2    202.000000
dtype: float64


df8.columns # Display Index Labels for Data Frame

Index(['customer', 'color', 'distance', 'sales'], dtype='object')

df8


new_df = df8[(df8['customer']!='105') & (df8['color']!='green')]
new_df


print(df8['sales'].mean())
print(df8['distance'].min())
print(df8['sales'].std())
print(df8['sales'].sum())

332.75
9
236.05419575456256
1331


def profit(s):
        return s*0.5 # 50% markup..


df8['sales'].apply(profit)

4     61.5
5    107.0
6    331.5
7    165.5
Name: sales, dtype: float64


df8['color'].apply(len) # len is the length of a string. Standard packge funciton.

4    6
5    5
6    5
7    4
Name: color, dtype: int64


df11 = df8[['distance','sales']]
df11.applymap(profit) #applymap only works on dataframes and it works entry by entry. (vs and entire column)


def col_sum(co):
        return sum(co) #Sums Column series
df11.apply(col_sum)

distance      86
sales       1331
dtype: int64


del df8['color'] #Delete column from within a dataframe
df8


df8.index

Int64Index([4, 5, 6, 7], dtype='int64')


df8.sort_values(by='distance') # if you use ,inplace=True) stays taht way.


mydict = {'customer': ['Customer 1','Customer 1','Customer2','Customer2','Customer3','Customer3'],
                    'product1':[1.1,2.1,3.8,4.2,5.5,6.9],
                    'product2':[8.2,9.1,11.1,5.2,44.66,983]}
df6 = pd.DataFrame(mydict,index=['Prucahase 1','Purchase 2','Purchase 3','Purchase 4','Purchase 5','Purchase 6'])
df6


grouped_data = df6.groupby('customer') # like a pivot table by grouping by customer.
print(grouped_data)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb42844a0d0>


grouped_data.describe()


df8.to_csv('df8.csv',index=True) # save with the index labels.


new_df8 = pd.read_csv('df8.csv',index_col=0) #index_col is the column to use for index labels. 
                                             #If you don't supply it creates a new index.
new_df8


df8.to_excel('df8.xlsx',index=False,sheet_name='first sheet')
newer_df8 = pd.read_excel('df8.xlsx',sheet_name='first sheet')
newer_df8

	0	1	2	3	4
0	-0.950540	1.590717	-0.447912	0.244753	0.258761
1	0.711849	-1.604905	0.006943	0.867151	1.645310
2	1.830404	-0.054315	1.313139	0.768555	1.310385
3	-1.132748	-0.333121	1.040200	-0.003932	0.712933
4	-1.235484	-1.019824	-0.172312	0.665532	-1.287178

	product1								product2
	count	mean	std	min	25%	50%	75%	max	count	mean	std	min	25%	50%	75%	max
customer
Customer 1	2.0	1.6	0.707107	1.1	1.35	1.6	1.85	2.1	2.0	8.65	0.636396	8.20	8.425	8.65	8.875	9.1
Customer2	2.0	4.0	0.282843	3.8	3.90	4.0	4.10	4.2	2.0	8.15	4.171930	5.20	6.675	8.15	9.625	11.1
Customer3	2.0	6.2	0.989949	5.5	5.85	6.2	6.55	6.9	2.0	513.83	663.506577	44.66	279.245	513.83	748.415	983.0

Pandas¶

Labels to Entries of a Series¶

Dataframes¶

Accessing and Modifying Dataframes¶

Add and Drop Columns or Rows¶

Remove Dataframe index labels¶

Mean function¶

Grouping¶

Combining data frames¶

Concat¶

Merge¶

Join¶

Additional Functionalities of pandas¶

Saving Data Frames to CSV and Excel¶

	first col	second col	third col	fourth col	fifth col
first row	-1.586390	-0.200170	1.243514	0.827881	-1.850758
second row	-0.780710	0.195434	-1.698385	0.782769	1.194898
third row	0.687090	0.002575	-0.793467	-0.518523	0.029827
fourth row	-0.009804	-0.593658	0.535864	-1.080070	-0.648657
fifth row	-2.271348	0.050926	-1.529726	-0.041977	1.402452

	first col	second col	third col	fourth col	fifth col
first row	False	False	True	True	False
second row	False	True	False	True	True
third row	True	True	False	False	True
fourth row	False	False	True	False	False
fifth row	False	True	False	False	True

	index	first col	second col	third col	fourth col	fifth col	sixth col
new name
This	first row	-1.586390	-0.200170	1.243514	0.827881	-1.850758	0.840793
is	third row	0.687090	0.002575	-0.793467	-0.518523	0.029827	1.069176
the	fourth row	-0.009804	-0.593658	0.535864	-1.080070	-0.648657	-1.014823
row	fifth row	-2.271348	0.050926	-1.529726	-0.041977	1.402452	0.213570

	customer	product1	product2
Purchase 1	Customer 1	1.1	8.20
Purchase 2	Customer 1	2.1	9.10
Purchase 3	Customer 2	3.8	11.10
Purchase 4	Customer 2	4.2	5.20
Purchase 5	Customer 3	5.5	44.66
Purchase 6	Customer 3	6.9	983.00

	customer	category	important	sales	color	distance
0	101	cat2	yes	123	NaN	NaN
1	102	cat2	no	52	NaN	NaN
2	103	cat1	yes	214	NaN	NaN
3	104	cat3	yes	663	NaN	NaN
4	101	NaN	NaN	123	yellow	12.0
5	103	NaN	NaN	214	green	9.0
6	104	NaN	NaN	663	green	44.0
7	105	NaN	NaN	331	blue	21.0

	customer	category	important	sales	customer	color	distance	sales
0	101	cat2	yes	123.0	NaN	NaN	NaN	NaN
1	102	cat2	no	52.0	NaN	NaN	NaN	NaN
2	103	cat1	yes	214.0	NaN	NaN	NaN	NaN
3	104	cat3	yes	663.0	NaN	NaN	NaN	NaN
4	NaN	NaN	NaN	NaN	101	yellow	12.0	123.0
5	NaN	NaN	NaN	NaN	103	green	9.0	214.0
6	NaN	NaN	NaN	NaN	104	green	44.0	663.0
7	NaN	NaN	NaN	NaN	105	blue	21.0	331.0

	Q1	Q2	Q3	Q4
I0	101.0	201.0	301.0	401.0
I1	103.0	202.0	NaN	NaN
I2	103.0	203.0	302.0	402.0
I3	NaN	NaN	303.0	403.0

	customer	product1	product2
Prucahase 1	Customer 1	1.1	8.20
Purchase 2	Customer 1	2.1	9.10
Purchase 3	Customer2	3.8	11.10
Purchase 4	Customer2	4.2	5.20
Purchase 5	Customer3	5.5	44.66
Purchase 6	Customer3	6.9	983.00