Introduction to Data Analysis

# Preface

This post collects the essential pandas (and NumPy) operations you need for everyday data analysis.

# Environment

The following code was tested on a MacBook Air (Apple Silicon) running macOS 12.3.

## Anaconda

Refer to THIS.

## Virtual environment

```bash
$ conda create -n <env name> python=3.9
$ conda activate <env name>
$ conda install pandas
```

## Jupyter notebook

Refer to THIS.

## Dependency

```
numpy
pandas
xlrd
xlwt
openpyxl
tabulate
```
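
To double-check that these dependencies are importable in the activated environment, here is a minimal sketch (it only assumes the packages expose the usual `__version__` attribute; any that don't are reported as unknown):

```python
# Sketch: verify the dependencies import and report their versions.
import importlib

for pkg in ["numpy", "pandas", "xlrd", "xlwt", "openpyxl", "tabulate"]:
    module = importlib.import_module(pkg)
    print(pkg, getattr(module, "__version__", "unknown"))
```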

# Basics of Python/Numpy

1. ```python
   # list comprehension with a conditional expression
   L = [1, 2, 3, 4, 5]
   l = [i if i <= 2 else 2 for i in L]  # cap every element at 2
   ```

2. ```python
   # map function & lambda
   a = list(map(lambda x, y: str(x) + '_' + y, range(5), list('abcde')))
   # ['0_a', '1_b', '2_c', '3_d', '4_e']
   ```

3. ```python
   # zip
   L1 = list('abc')
   L2 = list('edf')
   L3 = list('ghi')
   for i, j, k in zip(L1, L2, L3):
       print(i, j, k)
   ```

4. ```python
   # enumerate
   L = list('abc')
   for index, value in enumerate(L):
       print(index, value)
   ```

5. ```python
   # use zip to create a dictionary
   L1 = list('abc')
   L2 = list('edf')
   d = dict(zip(L1, L2))
   ```

6. ```python
   # unzip
   zipped = list(zip(L1, L2, L3))
   original = list(zip(*zipped))  # result is [tuple(L1), tuple(L2), tuple(L3)]
   ```

7. ```python
   # numpy
   import numpy as np

   # create an arithmetic sequence over [1, 5] with 11 samples
   a = np.linspace(1, 5, 11)

   # create an arithmetic sequence over [1, 5) with stride=2
   b = np.arange(1, 5, 2)

   # create an identity matrix
   i = np.eye(3)

   # create a random 3x4 matrix
   r = np.random.rand(3, 4)
   r_uniform = np.random.uniform(5, 15, 3)  # 3 samples from the uniform distribution on (5, 15)
   mu, sigma = 0, 1                         # example mean and standard deviation
   r_norm = np.random.normal(mu, sigma, 3)  # 3 samples from the normal distribution N(mu, sigma^2)

   # reproducible random numbers
   np.random.seed(2022)
   randomNumber = np.random.rand()
   ```

8. ```python
   a = np.array([-1, 1, -1, 0])
   b = np.where(a > 0, 0, a)  # replace the positive elements with 0
   c = np.nonzero(a)          # return the indices of the nonzero elements
   a.argmax()                 # return the index of the maximum element
   a.argmin()                 # return the index of the minimum element
   ```

9. ```python
   # covariance matrix
   target1 = np.array([1, 3, 5, 9])
   target2 = np.array([1, 5, 3, -9])
   np.cov(target1, target2)

   # correlation coefficient matrix (see the sketch after this list for how the two relate)
   np.corrcoef(target1, target2)
   ```
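
As a follow-up to the covariance/correlation item above, here is a short sketch of how the two results relate: the correlation coefficient is the covariance normalized by the product of the two standard deviations (`ddof=1` matches `np.cov`'s default normalization).

```python
import numpy as np

target1 = np.array([1, 3, 5, 9])
target2 = np.array([1, 5, 3, -9])

cov = np.cov(target1, target2)[0, 1]        # off-diagonal entry: Cov(target1, target2)
corr = np.corrcoef(target1, target2)[0, 1]  # off-diagonal entry: Corr(target1, target2)

# correlation = covariance / (std1 * std2)
manual_corr = cov / (target1.std(ddof=1) * target2.std(ddof=1))
assert np.isclose(corr, manual_corr)
```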

# Pandas

## Read & Write Data

1. ```python
   # read data
   import pandas as pd

   df_csv = pd.read_csv(path)
   df_txt = pd.read_table(path)
   df_excel = pd.read_excel(path)

   # parameters
   pd.read_csv(path, header=None)               # the file has no header row; treat the first line as data
   pd.read_csv(path, usecols=['col1', 'col2'])  # only read certain columns

   # for a txt file, if the separator is not the default, specify it explicitly
   pd.read_table(path, sep='/', engine='python')
   ```

2. ```python
   # write data
   df_csv.to_csv(path, index=False)                  # do not write the index
   df_csv.to_csv('data.txt', sep='\t', index=False)  # save to a txt file (tab-separated)

   # convert to markdown (requires tabulate)
   df_csv.to_markdown()
   # convert to latex
   df_csv.style.to_latex()
   ```

   A short round-trip sketch follows this list.
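
As a quick check of the read/write calls above, here is a minimal round-trip sketch; the file name `example.csv` and the column names are made up for illustration.

```python
import pandas as pd

# build a tiny DataFrame, write it to CSV without the index, and read it back
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
df.to_csv('example.csv', index=False)

df_back = pd.read_csv('example.csv', usecols=['col1', 'col2'])
print(df_back.equals(df))     # True: the round trip preserves the data
print(df_back.to_markdown())  # requires the tabulate package
```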

## Data Structure

1. ```python
   # Series -- one-dimensional data (index + data)
   s = pd.Series(data=[100, 'a', {'a': 100}],
                 index=pd.Index(['1', 2, 'third'], name='idx_name'),
                 dtype='object',
                 name='table_name')
   s.values  # the data
   s.index   # the index
   s.dtypes  # for a Series, s.dtype works as well
   s.name    # the name of the Series
   s.shape   # the length of the data
   ```

2. ```python
   # DataFrame -- two-dimensional data (row index + column index)
   df = pd.DataFrame(data=[[1, 'a', 1.2], [2, 'b', 2.2], [3, 'c', 3.2]],
                     index=['row_%d' % i for i in range(3)],
                     columns=['col_0', 'col_1', 'col_2'])

   # select one column -- the result is a Series
   df['col_0']
   # select two columns -- the result is a DataFrame
   df[['col_0', 'col_1']]
   ```

   A short sketch of the Series/DataFrame distinction follows this list.
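
To make the Series-versus-DataFrame distinction concrete, here is a small sketch (self-contained, so it re-creates the `df` from the previous item):

```python
import pandas as pd

df = pd.DataFrame(data=[[1, 'a', 1.2], [2, 'b', 2.2], [3, 'c', 3.2]],
                  index=['row_%d' % i for i in range(3)],
                  columns=['col_0', 'col_1', 'col_2'])

print(type(df['col_0']))             # <class 'pandas.core.series.Series'>
print(type(df[['col_0', 'col_1']]))  # <class 'pandas.core.frame.DataFrame'>

# the single-column Series keeps the row index and takes the column name as its name
print(df['col_0'].name)   # 'col_0'
print(df['col_0'].index)  # Index(['row_0', 'row_1', 'row_2'], dtype='object')
```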

## Basic Functions

1. ```python
   # preview data
   df.head(N)  # display the first N rows
   df.tail(N)  # display the last N rows

   df.info()      # column names, non-null counts, dtypes, etc.
   df.describe()  # count, mean, std, min, quantiles, etc.
   ```

2. ```python
   df.mean()          # the mean of each column
   df.max()           # the max of each column
   df.quantile(0.75)  # the 0.75 quantile of each column
   df.count()         # the number of non-NaN elements in each column
   df.idxmax()        # the index of the maximum element of each column
   df.idxmin()        # the index of the minimum element of each column
   # all of the above take an axis parameter: axis=0 aggregates each column; axis=1 aggregates each row
   ```

3. ```python
   # drop duplicates
   df.drop_duplicates(['col_0', 'col_1'], keep='first')  # keep the first of the rows that share values in col_0 and col_1
   # keep='first' -- keep the first row; 'last' -- keep the last row; False -- remove all duplicated rows
   ```

4. ```python
   # replace
   df['Gender'].replace({'Female': 0, 'Male': 1})    # replace 'Female' with 0 and 'Male' with 1
   df['Gender'].replace(['Female', 'Male'], [0, 1])  # same
   ```

5. ```python
   # replace with conditions
   s = pd.Series([-1, 1.2345, 100, -50])

   s.where(s < 0)       # replace the elements >= 0 with NaN
   s.where(s < 0, 100)  # replace the elements >= 0 with 100

   s.mask(s < 0)        # replace the elements < 0 with NaN
   s.mask(s < 0, -50)   # replace the elements < 0 with -50
   ```

6. ```python
   s.round(2)    # keep 2 decimals
   s.abs()       # absolute value
   s.clip(0, 2)  # clip values to the range [0, 2]
   ```

7. ```python
   df.sort_values('col_0', ascending=True)                      # sort by one column
   df.sort_values(['col_0', 'col_1'], ascending=[True, False])  # sort by two columns
   ```

8. ```python
   def my_operation(x):
       return x + 1

   df.apply(my_operation, axis=0)     # apply my_operation to each column
   df.apply(lambda x: x + 1, axis=0)  # same
   # note: apply is not particularly efficient
   ```

9. ```python
   # rolling window
   s = pd.Series([1, 2, 3, 4, 5])
   roller = s.rolling(window=3)  # window size=3, stride=1

   roller.mean()
   roller.sum()
   roller.corr(another_series)  # rolling correlation with another Series of the same length
   roller.cov(another_series)   # rolling covariance with another Series of the same length
   roller.apply(lambda x: x.mean())
   ```

10. ```python
    # expanding window
    s = pd.Series([1, 3, 6, 10])
    s.expanding().mean()  # windows [1], [1, 3], [1, 3, 6], [1, 3, 6, 10] -> 1, 2, 3.33, 5
    ```

    A combined usage sketch follows this list.
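
To tie several of these functions together, here is a hedged sketch on a tiny made-up DataFrame (the `Gender` and `Score` columns and their values are invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({
    'Gender': ['Female', 'Male', 'Male', 'Female'],
    'Score':  [88.4567, 91.2, 91.2, 79.5],
})

# encode the categories, round, drop duplicated rows, then sort
df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})
df['Score'] = df['Score'].round(2)
df = df.drop_duplicates(['Gender', 'Score'], keep='first')
df = df.sort_values(['Gender', 'Score'], ascending=[True, False])

print(df.describe())         # count, mean, std, min, quantiles, max per numeric column
print(df['Score'].idxmax())  # index label of the highest score
```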

# Index

# Matplotlib