Introduction to Data Analysis

# Preface

This post collects the essential pandas (and NumPy) operations you need for everyday data analysis.

# Environment

The following code was tested on a MacBook Air (Apple Silicon) running macOS 12.3.

## Anaconda

Refer to THIS.

## Virtual environment

```bash
$ conda create -n <env name> python=3.9
$ conda activate <env name>
$ conda install pandas
```

## Jupyter notebook

Refer to THIS.

## Dependency

```
numpy
pandas
xlrd
xlwt
openpyxl
tabulate
```
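
To double-check that these dependencies are importable in the activated environment, here is a minimal sketch (it only assumes the packages expose the usual `__version__` attribute; any that don't are reported as unknown):

```python
# Sketch: verify the dependencies import and report their versions.
import importlib

for pkg in ["numpy", "pandas", "xlrd", "xlwt", "openpyxl", "tabulate"]:
    module = importlib.import_module(pkg)
    print(pkg, getattr(module, "__version__", "unknown"))
```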

# Basics of Python/Numpy

1. ```python
   # list comprehension with a conditional expression
   L = [1, 2, 3, 4, 5]
   l = [i if i <= 2 else 2 for i in L]  # cap every element at 2
   ```

2. ```python
   # map function & lambda
   a = list(map(lambda x, y: str(x) + '_' + y, range(5), list('abcde')))
   # ['0_a', '1_b', '2_c', '3_d', '4_e']
   ```

3. ```python
   # zip
   L1 = list('abc')
   L2 = list('edf')
   L3 = list('ghi')
   for i, j, k in zip(L1, L2, L3):
       print(i, j, k)
   ```

4. ```python
   # enumerate
   L = list('abc')
   for index, value in enumerate(L):
       print(index, value)
   ```

5. ```python
   # use zip to create a dictionary
   L1 = list('abc')
   L2 = list('edf')
   d = dict(zip(L1, L2))
   ```

6. ```python
   # unzip
   zipped = list(zip(L1, L2, L3))
   original = list(zip(*zipped))  # result is [tuple(L1), tuple(L2), tuple(L3)]
   ```

7. ```python
   # numpy
   import numpy as np

   # create an arithmetic sequence over [1, 5] with 11 samples
   a = np.linspace(1, 5, 11)

   # create an arithmetic sequence over [1, 5) with stride=2
   b = np.arange(1, 5, 2)

   # create an identity matrix
   i = np.eye(3)

   # create a random 3x4 matrix
   r = np.random.rand(3, 4)
   r_uniform = np.random.uniform(5, 15, 3)  # 3 samples from the uniform distribution on (5, 15)
   mu, sigma = 0, 1                         # example mean and standard deviation
   r_norm = np.random.normal(mu, sigma, 3)  # 3 samples from the normal distribution N(mu, sigma^2)

   # reproducible random numbers
   np.random.seed(2022)
   randomNumber = np.random.rand()
   ```

8. ```python
   a = np.array([-1, 1, -1, 0])
   b = np.where(a > 0, 0, a)  # replace the positive elements with 0
   c = np.nonzero(a)          # return the indices of the nonzero elements
   a.argmax()                 # return the index of the maximum element
   a.argmin()                 # return the index of the minimum element
   ```

9. ```python
   # covariance matrix
   target1 = np.array([1, 3, 5, 9])
   target2 = np.array([1, 5, 3, -9])
   np.cov(target1, target2)

   # correlation coefficient matrix (see the sketch after this list for how the two relate)
   np.corrcoef(target1, target2)
   ```
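
As a follow-up to the covariance/correlation item above, here is a short sketch of how the two results relate: the correlation coefficient is the covariance normalized by the product of the two standard deviations (`ddof=1` matches `np.cov`'s default normalization).

```python
import numpy as np

target1 = np.array([1, 3, 5, 9])
target2 = np.array([1, 5, 3, -9])

cov = np.cov(target1, target2)[0, 1]        # off-diagonal entry: Cov(target1, target2)
corr = np.corrcoef(target1, target2)[0, 1]  # off-diagonal entry: Corr(target1, target2)

# correlation = covariance / (std1 * std2)
manual_corr = cov / (target1.std(ddof=1) * target2.std(ddof=1))
assert np.isclose(corr, manual_corr)
```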

# Pandas

## Read & Write Data

1. ```python
   # read data
   import pandas as pd

   df_csv = pd.read_csv(path)
   df_txt = pd.read_table(path)
   df_excel = pd.read_excel(path)

   # parameters
   pd.read_csv(path, header=None)               # the file has no header row; treat the first line as data
   pd.read_csv(path, usecols=['col1', 'col2'])  # only read certain columns

   # for a txt file, if the separator is not the default, specify it explicitly
   pd.read_table(path, sep='/', engine='python')
   ```

2. ```python
   # write data
   df_csv.to_csv(path, index=False)                  # do not write the index
   df_csv.to_csv('data.txt', sep='\t', index=False)  # save to a txt file (tab-separated)

   # convert to markdown (requires tabulate)
   df_csv.to_markdown()
   # convert to latex
   df_csv.style.to_latex()
   ```

   A short round-trip sketch follows this list.
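
As a quick check of the read/write calls above, here is a minimal round-trip sketch; the file name `example.csv` and the column names are made up for illustration.

```python
import pandas as pd

# build a tiny DataFrame, write it to CSV without the index, and read it back
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
df.to_csv('example.csv', index=False)

df_back = pd.read_csv('example.csv', usecols=['col1', 'col2'])
print(df_back.equals(df))     # True: the round trip preserves the data
print(df_back.to_markdown())  # requires the tabulate package
```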

## Data Structure

1. ```python
   # Series -- one-dimensional data (index + data)
   s = pd.Series(data=[100, 'a', {'a': 100}],
                 index=pd.Index(['1', 2, 'third'], name='idx_name'),
                 dtype='object',
                 name='table_name')
   s.values  # the data
   s.index   # the index
   s.dtypes  # for a Series, s.dtype works as well
   s.name    # the name of the Series
   s.shape   # the length of the data
   ```

2. ```python
   # DataFrame -- two-dimensional data (row index + column index)
   df = pd.DataFrame(data=[[1, 'a', 1.2], [2, 'b', 2.2], [3, 'c', 3.2]],
                     index=['row_%d' % i for i in range(3)],
                     columns=['col_0', 'col_1', 'col_2'])

   # select one column -- the result is a Series
   df['col_0']
   # select two columns -- the result is a DataFrame
   df[['col_0', 'col_1']]
   ```

   A short sketch of the Series/DataFrame distinction follows this list.
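
To make the Series-versus-DataFrame distinction concrete, here is a small sketch (self-contained, so it re-creates the `df` from the previous item):

```python
import pandas as pd

df = pd.DataFrame(data=[[1, 'a', 1.2], [2, 'b', 2.2], [3, 'c', 3.2]],
                  index=['row_%d' % i for i in range(3)],
                  columns=['col_0', 'col_1', 'col_2'])

print(type(df['col_0']))             # <class 'pandas.core.series.Series'>
print(type(df[['col_0', 'col_1']]))  # <class 'pandas.core.frame.DataFrame'>

# the single-column Series keeps the row index and takes the column name as its name
print(df['col_0'].name)   # 'col_0'
print(df['col_0'].index)  # Index(['row_0', 'row_1', 'row_2'], dtype='object')
```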

## Basic Functions

1. ```python
   # preview data
   df.head(N)  # display the first N rows
   df.tail(N)  # display the last N rows

   df.info()      # column names, non-null counts, dtypes, etc.
   df.describe()  # count, mean, std, min, quantiles, etc.
   ```

2. ```python
   df.mean()          # the mean of each column
   df.max()           # the max of each column
   df.quantile(0.75)  # the 0.75 quantile of each column
   df.count()         # the number of non-NaN elements in each column
   df.idxmax()        # the index of the maximum element of each column
   df.idxmin()        # the index of the minimum element of each column
   # all of the above take an axis parameter: axis=0 aggregates each column; axis=1 aggregates each row
   ```

3. ```python
   # drop duplicates
   df.drop_duplicates(['col_0', 'col_1'], keep='first')  # keep the first of the rows that share values in col_0 and col_1
   # keep='first' -- keep the first row; 'last' -- keep the last row; False -- remove all duplicated rows
   ```

4. ```python
   # replace
   df['Gender'].replace({'Female': 0, 'Male': 1})    # replace 'Female' with 0 and 'Male' with 1
   df['Gender'].replace(['Female', 'Male'], [0, 1])  # same
   ```

5. ```python
   # replace with conditions
   s = pd.Series([-1, 1.2345, 100, -50])

   s.where(s < 0)       # replace the elements >= 0 with NaN
   s.where(s < 0, 100)  # replace the elements >= 0 with 100

   s.mask(s < 0)        # replace the elements < 0 with NaN
   s.mask(s < 0, -50)   # replace the elements < 0 with -50
   ```

6. ```python
   s.round(2)    # keep 2 decimals
   s.abs()       # absolute value
   s.clip(0, 2)  # clip values to the range [0, 2]
   ```

7. ```python
   df.sort_values('col_0', ascending=True)                      # sort by one column
   df.sort_values(['col_0', 'col_1'], ascending=[True, False])  # sort by two columns
   ```

8. ```python
   def my_operation(x):
       return x + 1

   df.apply(my_operation, axis=0)     # apply my_operation to each column
   df.apply(lambda x: x + 1, axis=0)  # same
   # note: apply is not particularly efficient
   ```

9. ```python
   # rolling window
   s = pd.Series([1, 2, 3, 4, 5])
   roller = s.rolling(window=3)  # window size=3, stride=1

   roller.mean()
   roller.sum()
   roller.corr(another_series)  # rolling correlation with another Series of the same length
   roller.cov(another_series)   # rolling covariance with another Series of the same length
   roller.apply(lambda x: x.mean())
   ```

10. ```python
    # expanding window
    s = pd.Series([1, 3, 6, 10])
    s.expanding().mean()  # windows [1], [1, 3], [1, 3, 6], [1, 3, 6, 10] -> 1, 2, 3.33, 5
    ```

    A combined usage sketch follows this list.
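
To tie several of these functions together, here is a hedged sketch on a tiny made-up DataFrame (the `Gender` and `Score` columns and their values are invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({
    'Gender': ['Female', 'Male', 'Male', 'Female'],
    'Score':  [88.4567, 91.2, 91.2, 79.5],
})

# encode the categories, round, drop duplicated rows, then sort
df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})
df['Score'] = df['Score'].round(2)
df = df.drop_duplicates(['Gender', 'Score'], keep='first')
df = df.sort_values(['Gender', 'Score'], ascending=[True, False])

print(df.describe())         # count, mean, std, min, quantiles, max per numeric column
print(df['Score'].idxmax())  # index label of the highest score
```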

# Index

# Matplotlib