1. Create a new Maven project in IDEA
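(The project also needs the Scala SDK configured and a Spark dependency in pom.xml so the code below compiles; the usual artifact is org.apache.spark:spark-core_<scala-version>, matched to your cluster's Spark and Scala versions.)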
2. Write the code in IDEA:
package com.ligong.hello

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println("Please specify the input and output paths")
      System.exit(1)
    }
    // Write to HDFS as root (set before any HDFS access)
    System.setProperty("HADOOP_USER_NAME", "root")
    // 1. Set up the Spark context
    val conf: SparkConf = new SparkConf().setAppName("WordCount")
    val sc: SparkContext = new SparkContext(conf)
    // 2. Read the input file into an RDD; args(0) is the input path
    val lines: RDD[String] = sc.textFile(args(0))
    // 3. Transformation: split each line into words on spaces
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // 4. Map each word to a (word, 1) pair
    val wordAndOnes: RDD[(String, Int)] = words.map((_, 1))
    // 5. Aggregate: sum the counts for each word
    val result: RDD[(String, Int)] = wordAndOnes.reduceByKey(_ + _)
    // Collapse to one partition and write to args(1), the output path
    result.repartition(1).saveAsTextFile(args(1))
    sc.stop()
  }
}
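Before packaging, you can smoke-test the same pipeline locally. A minimal sketch, assuming a local input file at /tmp/words.txt (the path and object name are placeholders); note that setMaster("local[*]") is only for local runs and should not be hard-coded in the cluster version above:

package com.ligong.hello

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCountLocal {
  def main(args: Array[String]): Unit = {
    // local[*] runs Spark in-process using all available cores
    val conf = new SparkConf().setAppName("WordCountLocal").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("/tmp/words.txt") // placeholder local input
    val result: RDD[(String, Int)] =
      lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    result.collect().foreach(println) // print counts instead of writing to HDFS
    sc.stop()
  }
}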
3. Package the project into a jar
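For a standard Maven project this is one command, run from the project root (the jar lands in target/ under the name set in pom.xml, so rename it to wc.jar if needed):
mvn clean package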
4. Upload the jar to /export/spark-jar/
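One way to copy it over, assuming SSH access to node1 as root (the host and user from this cluster's setup; adjust to yours):
scp target/wc.jar root@node1:/export/spark-jar/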
5. Grant permissions:
chmod -R 777 wc.jar
6. Submit the job:
spark-submit \
--master yarn \
--deploy-mode cluster \
--driver-memory 512m \
--executor-memory 512m \
--num-executors 1 \
--class com.ligong.hello.WordCount \
/export/spark-jar/wc.jar \
hdfs://node1:8020/wordcount/input/words.txt \
hdfs://node1:8020/wordcount/output
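The two trailing arguments become args(0) and args(1) in main: the HDFS input file and the output directory. The output directory must not already exist, or saveAsTextFile will fail.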
7. Check the output:
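Because the result was repartitioned to a single partition, it lands in one part file; a quick way to view it (the part-file name may vary):
hdfs dfs -cat hdfs://node1:8020/wordcount/output/part-00000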