neural-networks.io

neural-networks.io

DataFrames in Python



In the following, we'll assume pandas and NumPy are respectively imported as pd and np:

import pandas as pd
import numpy as np
 

# Create DataFrames

 
# Create an empty DataFrame
>>> df = pd.DataFrame()
>>> df
Empty DataFrame
Columns: []
Index: []
 
# Create an empty DataFrame with column names (i.e. header)
>>> df = pd.DataFrame(columns = ['C1', 'C2', 'C3'])
>>> df
Empty DataFrame
Columns: [C1, C2, C3]
Index: []
 
# Create an empty DataFrame with row names (i.e. index)
>>> df = pd.DataFrame(index = ['R1', 'R2', 'R3', 'R4'])
>>> df
Empty DataFrame
Columns: []
Index: [R1, R2, R3, R4]
 
# Create an empty DataFrame with row (i.e. index) and column names (i.e. header)
>>> df = pd.DataFrame(index = ['R1', 'R2', 'R3', 'R4'], columns = ['C1', 'C2', 'C3'])
>>> df
     C1   C2   C3
R1  NaN  NaN  NaN
R2  NaN  NaN  NaN
R3  NaN  NaN  NaN
R4  NaN  NaN  NaN
 

# Populate DataFrames

 
# Create a DataFrame initialized with a given value
>>> df = pd.DataFrame([['myValue']*4]*3)
>>> df
         0        1        2        3
0  myValue  myValue  myValue  myValue
1  myValue  myValue  myValue  myValue
2  myValue  myValue  myValue  myValue
 
# Create and populate a DataFrame
>>> df = pd.DataFrame ( [ [1,2,3] , [4,5,6] ] )
>>> df
   0  1  2
0  1  2  3
1  4  5  6
 
# Create and populate a DataFrame from a list
>>> myList=[ [1,2,3] , [4,5,6] ]
>>> df = pd.DataFrame(myList)
>>> df
   0  1  2
0  1  2  3
1  4  5  6
 
# Create and populate a DataFrame from a dictionnary
>>> dict={'C1': [1, 4], 'C2': [2, 5], 'C3': [3, 6] }
>>> df = pd.DataFrame(dict)
>>> df
   C1  C2  C3
0   1   2   3
1   4   5   6
 
# Create and populate a DataFrame from a NumPy array
>>> import numpy as np
>>> ar=np.array([ [1,2,3] , [4,5,6] ])
>>> df = pd.DataFrame(ar)
>>> df
   0  1  2
0  1  2  3
1  4  5  6
 

# Exploring data

 
# Get the number of rows
>>> df = pd.DataFrame(np.random.randn(26, 3))
>>> len(df)
26
# Display the 5 first rows of a DataFrame
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.head()
          0         1         2
0  0.015568 -0.800550  0.995735
1 -0.442080 -0.468169 -0.037934
2 -0.056612  0.602909  0.925391
3 -0.026737 -0.995818 -0.971497
4  2.087257 -0.499205  0.523772
 
# Display the N first rows of a DataFrame
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.head(7)
          0         1         2
0  0.677265 -0.258972 -0.952009
1 -0.386076  0.045306  0.032096
2 -1.563352 -1.878010 -0.464059
3  0.406500  0.342614 -0.138983
4 -0.122513  1.020732 -0.943917
5 -1.521629 -1.247523  0.505387
6 -0.049314  0.507946 -0.543623
 
# Display the last 5 rows of a DataFrame
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.tail()
           0         1         2
15  0.104621 -1.089801 -1.115315
16 -1.211985  1.725402 -0.045163
17  0.346928 -0.659337 -0.337814
18 -0.495076 -0.183246 -1.468753
19 -2.023775  0.108744  0.299338
 
# Display the last N rows of a DataFrame
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.tail(3)
           0         1         2
17  1.240378  1.526510  1.443443
18 -0.374890  1.472440 -0.122570
19 -1.431079 -0.782603 -1.258807
 
# Display a random sample of N rows
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.sample(3)
           0         1         2
2   0.648810 -0.118470  0.674101
17  0.982777 -1.372479  0.532249
15  2.257330 -1.118517 -0.880213
 
# Number of rows/columns in a tuple
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.shape
(20, 3)
 
# Calculate measures of tendency
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.describe()
               0          1          2
count  20.000000  20.000000  20.000000
mean   -0.073553   0.082187  -0.119492
std     1.025429   1.063697   0.999306
min    -2.046268  -1.323986  -1.759392
25%    -0.755839  -0.697858  -0.962823
50%     0.072743  -0.239909  -0.225349
75%     0.555210   0.745141   0.268392
max     1.932597   1.933956   1.952840
 
# Concise summary of a DataFrame
>>> df = pd.DataFrame(np.random.randn(20, 3))
>>> df.info()

Int64Index: 20 entries, 0 to 19
Data columns (total 3 columns):
0    20 non-null float64
1    20 non-null float64
2    20 non-null float64
dtypes: float64(3)
memory usage: 640.0 bytes
 

# Indexing and selecting single cell

 
# Select a specific cell by row/columns
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , columns=['A','B','C'] )
>>> df.ix[2,'B']
7
or:
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , columns=['A','B','C'] )
>>> df.B.ix[2]
7
 

# Indexing and selecting single column

 
# Select a single column by label
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , columns=['A','B','C'] )
>>> df['C']
0     2
1     5
2     8
3    11
4    14
Name: C, dtype: int64
 
# Select a single column by index
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , columns=['A','B','C'] )
>>> df[df.columns[2]]
0     2
1     5
2     8
3    11
4    14
Name: C, dtype: int64
or:
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , columns=['A','B','C'] )
>>> df.ix[:,2]
0     2
1     5
2     8
3    11
4    14
Name: C, dtype: int64
or:
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , columns=['A','B','C'] )
>>> df.iloc[:,2]
0     2
1     5
2     8
3    11
4    14
Name: C, dtype: int64
 

# Selecting single row

 
# Select a single row by index
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , index = ['r1','r2','r3','r4','r5'])
>>> df.iloc[2]
0    6
1    7
2    8
Name: r3, dtype: int64
or:
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , index = ['r1','r2','r3','r4','r5'])
>>> df.ix[2]
0    6
1    7
2    8
Name: r3, dtype: int64
 
# Select a single row by label
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , index = ['r1','r2','r3','r4','r5'])
>>> df.loc['r2']
0    3
1    4
2    5
Name: r2, dtype: int64
or:
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)) , index = ['r1','r2','r3','r4','r5'])
>>> df.ix['r2']
0    3
1    4
2    5
Name: r2, dtype: int64
 

# Selecting multiple rows and columns

 
# Select multiple rows and columns by index
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)), index = ['r1','r2','r3','r4','r5'], columns=['A', 'B', 'C'])
>>> df.iloc[2:4, 1:3]
     B   C
r3   7   8
r4  10  11
or:
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)), index = ['r1','r2','r3','r4','r5'], columns=['A', 'B', 'C'])
>>> df.ix[2:4, 1:3]
     B   C
r3   7   8
r4  10  11
 
# Select multiple rows and columns by label
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)), index = ['r1','r2','r3','r4','r5'], columns=['A', 'B', 'C'])
>>> df.loc[['r2', 'r3', 'r4'], ['B', 'C']]
     B   C
r2   4   5
r3   7   8
r4  10  11
or:
>>> df = pd.DataFrame( np.arange(15).reshape((5, 3)), index = ['r1','r2','r3','r4','r5'], columns=['A', 'B', 'C'])
>>> df.ix[['r2', 'r3', 'r4'], ['B', 'C']]
     B   C
r2   4   5
r3   7   8
r4  10  11
 

# Iterating over rows

 
# Iterate over rows
>>> df = pd.DataFrame(np.random.randn(7, 3), columns=['A','B','C'])
>>> for index in df.index:
...     print ("df[" + str(index) + "]['B']=" + str(df['B'][index]))
... 
df[0]['B']=0.283954993534
df[1]['B']=-0.133365147072
df[2]['B']=-0.818946302448
df[3]['B']=0.268719821998
df[4]['B']=0.502778137428
df[5]['B']=1.63455639172
df[6]['B']=-0.267843736515
 
# Iterate over rows with iterrows
>>> df = pd.DataFrame(np.random.randn(7, 3), columns=['A','B','C'])
>>> for index, row in df.iterrows():
...     print ("df[" + str(index) + "]['B']=" + str(row['B']))
... 
df[0]['B']=0.527559779712
df[1]['B']=0.430168409951
df[2]['B']=0.451917861905
df[3]['B']=0.439450997295
df[4]['B']=-0.786818697065
df[5]['B']=0.760795411371
df[6]['B']=0.871493527713
 
# Iterate over rows with itertuples
>>> df = pd.DataFrame(np.random.randn(7, 3), columns=['A','B','C'])
>>> for row in df.itertuples():
...     print ("df[" + str(row.Index) + "]['B']=" + str(row.B))
... 
df[0]['B']=-1.768612347340564
df[1]['B']=-0.35989018257067396
df[2]['B']=-0.6307394564694447
df[3]['B']=-0.8745804495249828
df[4]['B']=-0.882188475278571
df[5]['B']=-1.618288671196083
df[6]['B']=1.2159031714168103
 

# Save DataFrames into file

 
# Save a DataFrame into a csv file
>>> df=pd.DataFrame ([[1, 2, 3],[4, 5, 6]] , index = ['R1', 'R2'], columns = ['C1', 'C2', 'C3'])
>>> df.to_csv('filename.csv')
>>> 
$ cat filename.csv 
,C1,C2,C3
R1,1,2,3
R2,4,5,6
 
# Save a DataFrame into a csv file omiting column names (without header)
>>> df=pd.DataFrame ([[1, 2, 3],[4, 5, 6]] , index = ['R1', 'R2'], columns = ['C1', 'C2', 'C3'])
>>> df.to_csv('filename.csv', header = False)
>>> 
$ cat filename.csv 
R1,1,2,3
R2,4,5,6
 
# Save a DataFrame into a csv file omiting row names (without index)
>>> df=pd.DataFrame ([[1, 2, 3],[4, 5, 6]] , index = ['R1', 'R2'], columns = ['C1', 'C2', 'C3'])
>>> df.to_csv('filename.csv', index = False)
>>> 
$ cat filename.csv 
C1,C2,C3
1,2,3
4,5,6
 
# Save a DataFrame into a tsv (Tabulation-Separated) file
>>> df=pd.DataFrame ([[1, 2, 3],[4, 5, 6]] , index = ['R1', 'R2'], columns = ['C1', 'C2', 'C3'])
>>> df.to_csv('filename.tsv', sep="\t")
>>> 
$ cat filename.tsv 
	C1	C2	C3
R1	1	2	3
R2	4	5	6
 
# Save a DataFrame into a csv file with a random separator
>>> df=pd.DataFrame ([[1, 2, 3],[4, 5, 6]] , index = ['R1', 'R2'], columns = ['C1', 'C2', 'C3'])
>>> df.to_csv('filename.csv', sep="+")
>>> 
$ cat filename.csv 
+C1+C2+C3
R1+1+2+3
R2+4+5+6
 

# Read DataFrames from file

 
# Load DataFrame from a csv file
$ cat filename.csv 
C1,C2,C3
1,2,3
4,5,6
>>> df = pd.read_csv('filename.csv')
>>> df
   C1  C2  C3
0   1   2   3
1   4   5   6
 
# Load DataFrame from a csv file without column names (i.e. header)
$ cat filename.csv 
1,2,3
4,5,6
>>> df=pd.read_csv('filename.csv', header=None)
>>> df
   0  1  2
0  1  2  3
1  4  5  6
 
# Load DataFrame from a csv file with first column as row names (i.e. index)
$ cat filename.csv 
,C1,C2,C3
R0,1,2,3
R1,4,5,6
>>> df=pd.read_csv('filename.csv', index_col = 0)
>>> df
    C1  C2  C3
R0   1   2   3
R1   4   5   6
 
# Load DataFrame from a tsv (Tabulation-Separated) file
$ cat filename.tsv 
C1	C2	C3
1	2	3
4	5	6
>>> df=pd.read_csv('filename.tsv', sep="\t")
>>> df
   C1  C2  C3
0   1   2   3
1   4   5   6
 
# Load DataFrame from a csv file with random separator
$ cat filename.csv 
C1+C2+C3
1+2+3
4+5+6
>>> df=pd.read_csv('filename.csv', sep="+")
>>> df
   C1  C2  C3
0   1   2   3
1   4   5   6