Introduction to Data Analysis
Preface
Here is everything you need to know about pandas.
Environment
The following code is tested on a MacBook Air (Apple Silicon), macOS 12.3.
Anaconda
Refer to THIS.
Virtual environment
```bash
conda create -n <env name> python=3.9
```
Jupyter notebook
Refer to THIS
Dependency
```
numpy
```
Basics of Python/Numpy
1. ```python
   # conditional list comprehension
   L = [1, 2, 3, 4, 5]
   l = [i if i <= 2 else 2 for i in L]
   ```
2. ```python
   # map function & lambda
   a = list(map(lambda x, y: str(x)+'_'+y, range(5), list('abcde')))
   ```
3. ```python
   # zip
   L1 = list('abc')
   L2 = list('edf')
   L3 = list('ghi')
   for i, j, k in zip(L1, L2, L3):
       print(i, j, k)
   ```
4. ```python
   # enumerate
   L = list('abc')
   for index, value in enumerate(L):
       print(index, value)
   ```
5. ```python
   # use zip to create dictionary
   L1 = list('abc')
   L2 = list('edf')
   d = dict(zip(L1, L2))
   ```
6. ```python
   # unzip
   zipped = list(zip(L1, L2, L3))
   original = list(zip(*zipped))  # result is [L1, L2, L3]
   ```
7. ```python
   # numpy
   # create arithmetic sequence [1, 5] including 11 samples
   a = np.linspace(1, 5, 11)
   # create arithmetic sequence [1, 5) with stride=2
   b = np.arange(1, 5, 2)
   # create identity matrix
   i = np.eye(3)
   # create random 3x4 matrix
   r = np.random.rand(3, 4)
   r_uniform = np.random.uniform(5, 15, 3)  # uniform distribution (5, 15)
   r_norm = np.random.normal(mu, sigma, 3)  # normal distribution with (mu, sigma^2)
   # random seed for reproducibility
   np.random.seed(2022)
   randomNumber = np.random.rand()
   ```
8. ```python
   a = np.array([-1, 1, -1, 0])
   b = np.where(a > 0, 0, a)
   c = np.nonzero(a)  # return the index of nonzero element
   a.argmax()  # return the index of maximum element
   a.argmin()  # return the index of minimum element
   ```
9. ```python
   # return covariance matrix
   target1 = np.array([1, 3, 5, 9])
   target2 = np.array([1, 5, 3, -9])
   np.cov(target1, target2)
   # return correlation coefficient
   np.corrcoef(target1, target2)
   ```
# Pandas
## Read & Write Data
1. ```python
   # read data
   df_csv = pd.read_csv(path)
   df_txt = pd.read_table(path)
   df_excel = pd.read_excel(path)
   # parameters
   pd.read_csv(path, header=None)  # remove header
   pd.read_csv(path, usecols=['col1', 'col2'])  # only select certain columns
   # for txt file, if separator is not space, you have to specify it.
   pd.read_table(path, sep='/', engine='python')
   ```
2. ```python
   # write data
   df_csv.to_csv(path, index=False)  # remove index
   df_csv.to_csv('data.txt', sep='\t', index=False)  # save to txt file
   # convert to markdown
   df_csv.to_markdown()
   # convert to latex
   df_csv.style.to_latex()
   ```
## Data Structure
1. ```python
   # Series -- one-dimensional data (index-data)
   s = pd.Series(data=[100, 'a', {'a': 100}],
                 index=pd.Index(['1', 2, 'third'], name='idx_name'),
                 dtype='object',
                 name='table_name')
   s.values  # data
   s.index
   s.dtypes  # for Series, s.dtype works as well
   s.name
   s.shape  # length of data
   ```
2. ```python
   # DataFrame -- two-dimensional data (row index + column index)
   df = pd.DataFrame(data=[[1, 'a', 1.2], [2, 'b', 2.2], [3, 'c', 3.2]],
                     index=['row_%d' % i for i in range(3)],
                     columns=['col_0', 'col_1', 'col_2'])
   # select one column -- it is a Series
   df['col_0']
   # select two columns -- it is a DataFrame
   df[['col_0', 'col_1']]
   ```
## Basic Functions
1. ```python
   # preview data
   df.head(N)  # display the first N rows
   df.tail(N)  # display the last N rows
   df.info()  # return column name, count, dtype, etc.
   df.describe()  # return count, mean, std, min, etc.
   ```
2. ```python
   df.mean()  # return the mean value of each column
   df.max()  # return the max value of each column
   df.quantile(0.75)  # 0.75 quantile of each column
   df.count()  # the number of non-NaN elements
   df.idxmax()  # return the index of the maximum element of each column
   df.idxmin()  # return the index of the minimum element of each column
   # for all of the above, parameter axis=0 aggregates each column; axis=1 aggregates each row
   ```
3. ```python
   # drop duplicates
   df.drop_duplicates(['col_0', 'col_1'], keep='first')  # keep the first row of same value in col_0 and col_1
   # keep='first'--keep the first row; 'last'--keep the last row; False--remove all duplicated rows
   ```
4. ```python
   # replace
   df['Gender'].replace({'Female': 0, 'Male': 1})  # replace 'Female' with 0 and replace 'Male' with 1
   df['Gender'].replace(['Female', 'Male'], [0, 1])  # same
   ```
5. ```python
   # replace with conditions
   s = pd.Series([-1, 1.2345, 100, -50])
   s.where(s < 0)  # replace the elements >= 0 with NaN
   s.where(s < 0, 100)  # replace the elements >= 0 with 100
   s.mask(s < 0)  # replace the elements < 0 with NaN
   s.mask(s < 0, -50)  # replace the elements < 0 with -50
   ```
6. ```python
   s.round(2)  # keep 2 decimals
   s.abs()  # absolute value
   s.clip(0, 2)  # set floor and ceiling
   ```
7. ```python
   # sort
   df.sort_values('col_0', ascending=True)  # sort by one column
   df.sort_values(['col_0', 'col_1'], ascending=[True, False])  # sort by two columns
   ```
8. ```python
   # apply
   def my_operation(x):
       return x + 1
   df.apply(my_operation, axis=0)  # apply my_operation on each column
   df.apply(lambda x: x + 1, axis=0)  # same
   # note: apply is not quite efficient
   ```
9. ```python
   # rolling window
   s = pd.Series([1, 2, 3, 4, 5])
   roller = s.rolling(window=3)  # window size=3, stride=1
   roller.mean()
   roller.sum()
   roller.corr(another_roller)  # correlation coefficient
   roller.cov(another_roller)  # covariance
   roller.apply(lambda x: x.mean())
   ```
10. ```python
    # expanding window
    s = pd.Series([1, 3, 6, 10])
    s.expanding().mean()  # windows [1], [1, 3], [1, 3, 6], [1, 3, 6, 10] -> 1, 2, 3.33, 5
    ```