Low-level WordCount
package tcode.chapter07

import scala.io.Source

object $17_WordCountLow {

  def main(args: Array[String]): Unit = {
    // 1. Read the file line by line
    val datas = Source.fromFile("datas/wc2.txt", "utf-8").getLines().toList
    // List(hello hadoop flume kafka, kafka spark scala hadoop, hello java python hadoop, kafka flume spark spark, hello flume scala java)

    // 2. Split each line and flatten into a single list of words
    val words = datas.flatMap(line => line.split(" "))
    // List(hello, hadoop, flume, kafka, kafka, spark, ...)

    // 3. Group by word
    val groupedMap = words.groupBy(x => x)
    // Map(
    //   hello -> List(hello, hello, hello, ...)
    //   ...
    // )

    // 4. Count the occurrences of each word
    val result = groupedMap.map(x => {
      // x = hello -> List(hello, hello, hello, ...)
      (x._1, x._2.size)
    })
    // Map((word, total count), (word, total count), ...)

    result.foreach(x => println(x))
    println("-" * 100)

    // The same pipeline written as a single chained expression
    Source.fromFile("datas/wc.txt", "utf-8")
      .getLines()
      .toList
      .flatMap(_.split(" "))
      .groupBy(x => x)
      .map(x => (x._1, x._2.size))
      .foreach(println(_))
  }
}
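On Scala 2.13+, the groupBy + map steps above collapse into one call with groupMapReduce. A minimal sketch under that version assumption, reading the same datas/wc2.txt file; the object name is illustrative, not part of the original code:

package tcode.chapter07

import scala.io.Source

// Sketch only: assumes Scala 2.13+, where groupMapReduce is available.
object WordCountGroupMapReduce {
  def main(args: Array[String]): Unit = {
    val counts: Map[String, Int] =
      Source.fromFile("datas/wc2.txt", "utf-8")
        .getLines()
        .flatMap(_.split(" "))
        .toList
        .groupMapReduce(identity)(_ => 1)(_ + _) // key = word, map each occurrence to 1, reduce by summing
    counts.foreach(println)
  }
}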
High-level WordCount
package tcode.chapter07

object $18_WordCountHight {

  def main(args: Array[String]): Unit = {
    // Each tuple pairs a line of text with how many times that line occurs
    val tupleList = List(("Hello Scala Spark World", 4), ("Hello Scala Spark", 3), ("Hello Scala", 2), ("Hello", 1))

    // 1. Split each line, flatten, and assign every word its line's initial count
    val words = tupleList.flatMap(x => {
      // x = ("Hello Scala Spark World", 4)
      // Split the line into words
      val arr = x._1.split(" ")
      // Array(Hello, Scala, Spark, World)
      val tu = arr.map(y => {
        // y = Hello
        (y, x._2)
      })
      tu
    })
    // List((Hello,4), (Scala,4), (Spark,4), (World,4), (Hello,3), (Scala,3), (Spark,3), (Hello,2), (Scala,2), (Hello,1))

    // 2. Group by word
    val groupedMap = words.groupBy(x => x._1)
    // Map(
    //   Hello -> List((Hello,4), (Hello,3), (Hello,2), (Hello,1))
    //   ...
    // )

    // 3. Sum the counts for each word
    val result = groupedMap.map(x => {
      // x = Hello -> List((Hello,4), (Hello,3), (Hello,2), (Hello,1))
      // Equivalent reduce-based version:
      // val r = x._2.reduce((agg, curr) => (agg._1, agg._2 + curr._2))
      // r
      val r = x._2.map(y => y._2).sum
      (x._1, r)
    })

    // 4. Print the result
    result.foreach(x => println(x))
    // Map((Hello,10), (Scala,9), (Spark,7), (World,4))
  }
}
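The same weighted count can also be written without the intermediate names. A minimal sketch, again assuming Scala 2.13+ for groupMapReduce; the object name is illustrative:

package tcode.chapter07

// Sketch only: assumes Scala 2.13+.
object WordCountWeighted {
  def main(args: Array[String]): Unit = {
    val tupleList = List(("Hello Scala Spark World", 4), ("Hello Scala Spark", 3), ("Hello Scala", 2), ("Hello", 1))
    val result: Map[String, Int] =
      tupleList
        .flatMap { case (line, n) => line.split(" ").map(word => (word, n)) } // (word, initial count)
        .groupMapReduce(_._1)(_._2)(_ + _)                                   // group by word, keep the counts, sum them
    result.foreach(println) // (Hello,10), (Scala,9), (Spark,7), (World,4)
  }
}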