## 1. Notes
- When developing against a remote host from VS Code, `python file.py` relies on the `$PYTHONPATH` environment variable to locate the pyspark package, so it must be set or the import will fail (a quick import check is shown after this list). Add the following export to `~/.bashrc` and re-source it:
```bash
vim ~/.bashrc
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH
source ~/.bashrc
```
- Test: run `python wordcount.py` and check the results under the `/root/output` directory.
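A quick way to confirm the `PYTHONPATH` change took effect in the remote interpreter (a minimal check, assuming you are in the same shell that sourced `~/.bashrc`):
```bash
# Should print the installed PySpark version instead of raising ImportError
python -c "import pyspark; print(pyspark.__version__)"
```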
## 2. Integrated development environment tests
### 2.1 Spark local mode test
Run `python wordcount.py` directly.
```python
from pyspark import SparkContext
from pyspark import SparkConf

def CreateSparkContext():
    sparkConf = SparkConf().setAppName("WordCounts").set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc

def SetLogger(sc):
    # Raise the log4j level to ERROR to keep the console output clean
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

def SetPath(sc):
    # Local mode reads from the local filesystem; cluster modes read from HDFS
    global Path
    if sc.master[0:5] == "local":
        Path = "/root/pythonwork/test/"
    else:
        Path = "hdfs://master:9000/user/"

if __name__ == "__main__":
    print("Starting WordCount")
    sc = CreateSparkContext()
    print("Reading text file...")
    textFile = sc.textFile(Path + "test.txt")
    print("The file has " + str(textFile.count()) + " lines")
    countsRDD = textFile.flatMap(lambda line: line.split(" ")) \
                        .map(lambda word: (word, 1)) \
                        .reduceByKey(lambda x, y: x + y)
    print("There are " + str(countsRDD.count()) + " word entries")
    print("Saving to text file...")
    countsRDD.saveAsTextFile("/root/pythonwork/test/output")
```
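A minimal way to exercise this script and inspect the result (a hedged sketch; it assumes a `test.txt` already exists under `/root/pythonwork/test/`, and `saveAsTextFile` fails if the output directory already exists):
```bash
python wordcount.py
# saveAsTextFile writes one part-* file per partition under the output directory
ls /root/pythonwork/test/output
head /root/pythonwork/test/output/part-00000   # lines look like ('word', count)
```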
### 2.2 Create test directories
- Create a local test file and the directory to upload to HDFS:
```bash
mkdir -p ~/pythonwork/project/data
cp /usr/local/spark/README.md ~/pythonwork/project/data
```
- Start the Hadoop multi-node cluster:
```bash
start-all.sh
```
- Create the test directory on HDFS and upload the file (a read-back check follows this list):
```bash
hadoop fs -mkdir -p /user/root/data
hadoop fs -copyFromLocal ~/pythonwork/project/data/README.md /user/root/data
hadoop fs -ls /user/root/data/
```
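An optional read-back check before submitting any jobs (hedged; paths are the ones created above):
```bash
# Confirm the uploaded file is readable from HDFS
hadoop fs -cat /user/root/data/README.md | head
```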
### 2.3 Spark local mode
- Common `spark-submit` options (a sample local-mode invocation follows the test script below):
  - `--master MASTER_URL`
    - `local`
    - `local[n]`
    - `local[*]`
    - `spark://HOST:PORT`
    - `mesos://HOST:PORT`
    - `yarn`
  - `--driver-memory MEM`
  - `--executor-memory MEM`
  - `--name NAME`
- Test: create the script with `vim ~/pythonwork/project/wordcount.py`:
```python
from pyspark import SparkContext
from pyspark import SparkConf

def CreateSparkContext():
    sparkConf = SparkConf().setAppName("WordCounts").set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)
    print("master=" + sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc

def SetLogger(sc):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

def SetPath(sc):
    # Local mode reads from the local filesystem; cluster modes read from HDFS
    global Path
    if sc.master[0:5] == "local":
        Path = "/root/pythonwork/project/data/"
    else:
        Path = "hdfs://master:9000/user/root/data/"

if __name__ == "__main__":
    print("Starting WordCount")
    sc = CreateSparkContext()
    print("Reading text file...")
    textFile = sc.textFile(Path + "README.md")
    print("The file has " + str(textFile.count()) + " lines")
    countsRDD = textFile.flatMap(lambda line: line.split(" ")) \
                        .map(lambda word: (word, 1)) \
                        .reduceByKey(lambda x, y: x + y)
    print("There are " + str(countsRDD.count()) + " word entries")
    print("Saving to text file...")
    countsRDD.saveAsTextFile("/root/pythonwork/project/data/output")
```
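Combining the options listed above, a local-mode submission of this script could look like the following (a hedged sketch; the memory size and `--name` value are illustrative, and in local mode `SetPath()` makes the output land on the local filesystem):
```bash
cd ~/pythonwork/project/
spark-submit --master local[*] --driver-memory 512m --name WordCounts wordcount.py
# the output directory must not exist beforehand; results are plain-text part-* files
ls /root/pythonwork/project/data/output
```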
### 2.4 Spark YARN-Client mode
```bash
cd ~/pythonwork/project/
HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop spark-submit --driver-memory 512m --executor-cores 1 --master yarn --deploy-mode client wordcount.py
```
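Optional checks after a YARN-client submission (a hedged sketch; `<application_id>` is a placeholder for the id printed by `spark-submit`, and log aggregation must be enabled for `yarn logs` to return anything):
```bash
yarn application -list -appStates ALL | grep WordCounts   # confirm the application ran on YARN
yarn logs -applicationId <application_id>                 # inspect driver/executor logs
```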
### 2.5 Spark Standalone Cluster mode
```bash
cd ~/pythonwork/project/
/usr/local/spark/sbin/start-all.sh
spark-submit --master spark://master:7077 --deploy-mode client --executor-memory 512m --total-executor-cores 2 wordcount.py
```
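When finished, the standalone cluster can be stopped from the same sbin directory (hedged; the standalone master's web UI, on port 8080 by default, also lists the submitted application):
```bash
/usr/local/spark/sbin/stop-all.sh
```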