# Download the Scala 2.12.8 Debian package.
# NOTE: wget may time out — simply retry the download if it does.
wget https://downloads.lightbend.com/scala/2.12.8/scala-2.12.8.deb

# Install the package, then confirm the installation.
sudo dpkg -i scala-2.12.8.deb
scala -version
py4j
1 2
# Install py4j, the Java<->Python bridge library required by PySpark.
pip install py4j # or try `conda install py4j`
Spark
1 2 3 4
# Download a pre-built Spark 2.4.0 distribution.
# NOTE: this download may fail if the version was removed from the mirror;
# check the currently available versions at
# http://mirrors.tuna.tsinghua.edu.cn/apache/spark
wget http://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz

# Unpack the archive into the current directory.
tar xvf spark-2.4.0-bin-hadoop2.7.tgz
Environment
1 2 3 4 5 6 7 8 9 10
vim ~/.bashrc export SPARK_HOME=<path to spark-2.4.0-bin-hadoop2.7> export PATH=${SPARK_HOME}/bin:$PATH
source ~/.bashrc cd <path to spark-2.4.0-bin-hadoop2.7> cp conf/log4j.properties.template conf/log4j.properties vim conf/log4j.properties # replace `log4j.rootCategory=INFO, console` with log4j.rootCategory=ERROR, console
If you want to use PySpark inside a Jupyter notebook, additionally configure the following:
1 2 3 4 5 6 7 8 9
# Make PySpark importable from Python: append these lines to ~/.bashrc
# (e.g. with `vim ~/.bashrc`).
# NOTE: find your own path to the zip file starting with 'py4j' under
# $SPARK_HOME/python/lib — the version suffix may differ from 0.10.7.
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH
export PATH=$SPARK_HOME/python:$PATH
export PYSPARK_PYTHON=python3

# Reload the shell configuration, then install findspark.
source ~/.bashrc
pip install findspark
Start a Jupyter notebook, and import and initialize findspark before importing
pyspark
1 2 3
# Locate the Spark installation (via SPARK_HOME) and add it to sys.path;
# this must run before any pyspark import succeeds in the notebook.
import findspark
findspark.init()

from pyspark.sql import SparkSession