1. Create a new Maven project in IDEA
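(The project also needs the Scala SDK configured and a Spark dependency in pom.xml so the code below compiles; the usual artifact is org.apache.spark:spark-core_<scala-version>, matched to your cluster's Spark and Scala versions.)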
2. Write the code in IDEA:
package com.ligong.hello

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println("Please specify the input and output paths")
      System.exit(1)
    }
    // Write to HDFS as root (set before any HDFS access)
    System.setProperty("HADOOP_USER_NAME", "root")
    // 1. Set up the Spark context
    val conf: SparkConf = new SparkConf().setAppName("WordCount")
    val sc: SparkContext = new SparkContext(conf)
    // 2. Read the input file into an RDD; args(0) is the input path
    val lines: RDD[String] = sc.textFile(args(0))
    // 3. Transformation: split each line into words on spaces
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // 4. Map each word to a (word, 1) pair
    val wordAndOnes: RDD[(String, Int)] = words.map((_, 1))
    // 5. Aggregate: sum the counts for each word
    val result: RDD[(String, Int)] = wordAndOnes.reduceByKey(_ + _)
    // Collapse to one partition and write to args(1), the output path
    result.repartition(1).saveAsTextFile(args(1))
    sc.stop()
  }
}
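Before packaging, you can smoke-test the same pipeline locally. A minimal sketch, assuming a local input file at /tmp/words.txt (the path and object name are placeholders); note that setMaster("local[*]") is only for local runs and should not be hard-coded in the cluster version above:

package com.ligong.hello

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCountLocal {
  def main(args: Array[String]): Unit = {
    // local[*] runs Spark in-process using all available cores
    val conf = new SparkConf().setAppName("WordCountLocal").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("/tmp/words.txt") // placeholder local input
    val result: RDD[(String, Int)] =
      lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    result.collect().foreach(println) // print counts instead of writing to HDFS
    sc.stop()
  }
}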
3. Package the project into a jar
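For a standard Maven project this is one command, run from the project root (the jar lands in target/ under the name set in pom.xml, so rename it to wc.jar if needed):
mvn clean package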
4. Upload the jar to /export/spark-jar/
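One way to copy it over, assuming SSH access to node1 as root (the host and user from this cluster's setup; adjust to yours):
scp target/wc.jar root@node1:/export/spark-jar/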
5. Grant permissions:
chmod -R 777 wc.jar
6. Submit the job:
spark-submit \
--master yarn \
--deploy-mode cluster \
--driver-memory 512m \
--executor-memory 512m \
--num-executors 1 \
--class com.ligong.hello.WordCount \
/export/spark-jar/wc.jar \
hdfs://node1:8020/wordcount/input/words.txt \
hdfs://node1:8020/wordcount/output
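The two trailing arguments become args(0) and args(1) in main: the HDFS input file and the output directory. The output directory must not already exist, or saveAsTextFile will fail.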
7. Check the output:
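Because the result was repartitioned to a single partition, it lands in one part file; a quick way to view it (the part-file name may vary):
hdfs dfs -cat hdfs://node1:8020/wordcount/output/part-00000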