PySpark Install

PySpark

Install

Environment: WSL Ubuntu 18

Conda

Install Anaconda and create a new environment with python=3.6.
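
For example, a minimal sketch (the environment name pyspark is arbitrary):

conda create -n pyspark python=3.6   # "pyspark" is just an example name
conda activate pyspark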

Java 8

sudo apt-get update
sudo apt-get install openjdk-8-jdk
# java -version

Scala

wget https://downloads.lightbend.com/scala/2.12.8/scala-2.12.8.deb # wget may time out; try again if it does
sudo dpkg -i scala-2.12.8.deb
# scala -version

py4j

pip install py4j
# or try `conda install py4j`

Spark

# this download may fail if the release has been removed from the mirror
# check the versions currently available at http://mirrors.tuna.tsinghua.edu.cn/apache/spark
wget http://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
tar xvf spark-2.4.0-bin-hadoop2.7.tgz

Environment

vim ~/.bashrc
# add the following two lines to the end of ~/.bashrc
export SPARK_HOME=<path to spark-2.4.0-bin-hadoop2.7>
export PATH=${SPARK_HOME}/bin:$PATH

source ~/.bashrc
cd <path to spark-2.4.0-bin-hadoop2.7>
cp conf/log4j.properties.template conf/log4j.properties
vim conf/log4j.properties
# replace `log4j.rootCategory=INFO, console` with
log4j.rootCategory=ERROR, console
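
After sourcing ~/.bashrc, a quick sanity check (assuming the paths above are set correctly):

spark-submit --version   # should report Spark 2.4.0
pyspark                  # opens the interactive PySpark shell; exit() to quit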

If you want to use PySpark in a Jupyter notebook, also add the following to ~/.bashrc:

vim ~/.bashrc
# add the lines below; adjust the py4j zip name to match the file under $SPARK_HOME/python/lib
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH
export PATH=$SPARK_HOME/python:$PATH
export PYSPARK_PYTHON=python3

source ~/.bashrc
pip install findspark

In the Jupyter notebook, import and initialize findspark before importing pyspark:

import findspark
findspark.init()
from pyspark.sql import SparkSession
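
From there, a minimal sketch of creating a local session and running a quick check (the app name is arbitrary):

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("pyspark-test") \
    .getOrCreate()

spark.range(5).show()   # prints a small DataFrame with a single "id" column
spark.stop()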