Some notes from Song Ming: the dependencies to add

    // If you also want to use MLlib or Spark Hive, add the corresponding libraryDependencies
    // Song Ming (Spark) 2015-07-28 17:36:24
    // for breeze
    libraryDependencies += "org.scalanlp" %% "breeze" % "0.10" % "provided"
    // for breeze native (BLAS/LAPACK) bindings
    libraryDependencies += "org.scalanlp" %% "breeze-natives" % "0.10" % "provided"
    libraryDependencies ++= Seq(
      ("org.apache.spark" %% "spark-graphx" % "1.2.1" % "provided").
        exclude("org.eclipse.jetty.orbit", "javax.transaction").
        exclude("org.eclipse.jetty.orbit", "javax.servlet").
        exclude("org.eclipse.jetty.orbit", "javax.mail.glassfish").
        exclude("com.esotericsoftware.kryo", "kryo").
        exclude("commons-beanutils", "commons-beanutils").
        exclude("commons-collections", "commons-collections")
    )
    libraryDependencies ++= Seq(
      ("org.apache.spark" %% "spark-mllib" % "1.2.1" % "provided").
        exclude("org.eclipse.jetty.orbit", "javax.transaction").
        exclude("org.eclipse.jetty.orbit", "javax.servlet").
        exclude("org.eclipse.jetty.orbit", "javax.mail.glassfish").
        exclude("com.esotericsoftware.kryo", "kryo").
        exclude("commons-beanutils", "commons-beanutils").
        exclude("commons-collections", "commons-collections")
    )
    libraryDependencies ++= Seq(
      ("org.apache.spark" %% "spark-core" % "1.2.1" % "provided").
        exclude("org.eclipse.jetty.orbit", "javax.servlet").
        exclude("org.eclipse.jetty.orbit", "javax.mail.glassfish").
        exclude("org.eclipse.jetty.orbit", "javax.activation").
        exclude("com.esotericsoftware.kryo", "kryo").
        exclude("commons-beanutils", "commons-beanutils-core").
        exclude("commons-beanutils", "commons-beanutils")
    )
    // libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.0.0" % "provided"
    // keep the hive artifact on the same version as the other Spark artifacts above
    libraryDependencies += "org.apache.spark" %% "spark-hive" % "1.2.1" % "provided"
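
For context, these lines live in a project's build.sbt. A minimal sketch of the surrounding file, assuming a placeholder project name/version and Scala 2.10 (the Scala line the Spark 1.2.1 artifacts are published for):

    name := "spark-channel-jobs"   // placeholder name, not from the original notes
    version := "0.1-SNAPSHOT"      // placeholder version
    scalaVersion := "2.10.4"       // Spark 1.2.1 artifacts are built for Scala 2.10
    // ... the libraryDependencies lines from above go here ...
    // Artifacts marked "provided" are expected on the runtime classpath (e.g. the Spark
    // assembly used by spark-submit / spark-shell) instead of being bundled into the app jar.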

    Processing the logs without hsql (Hive SQL); not very meaningful, kept for reference

    // sample input: hdfs:///home/hdp-like/channel/online/pay_data/2015-06-20/04_pay_channel.log
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._   // .toDF() requires Spark 1.3+; on 1.2.x use createSchemaRDD instead
    //case class FirstLogin(qid: String, channel: String, app_key: String, login_date: String)
    case class FirstLogin(qid: String)
    //val people = sc.textFile("hdfs:///home/hdp-like/channel/online/pay_data").map(_.split("\t")).map(p => FirstLogin(p(0), p(1), p(2), p(3))).toDF()
    //val rdd = sc.textFile("hdfs:///home/hdp-like/zhushou/db/first_login_detail_qid_channel_appkey/*").map(_.split("\t")).map(p => FirstLogin(p(0), p(1), p(2), p(3))).toDF()
    val rdd = sc.textFile("hdfs:///home/hdp-like/zhushou/db/first_login_detail_qid_channel_appkey/*").map(_.split("\t")).map(p => FirstLogin(p(0))).toDF()
    rdd.registerTempTable("table_test")
    val res = sqlContext.sql("SELECT count(1) FROM table_test")
    res.collect().foreach(println)
    // Generic ways to read Row fields (from the Spark SQL docs), for a DataFrame such as teenagers:
    // teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
    // or by field name:
    // teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)
    // row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
    // teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)
    // Map("name" -> "Justin", "age" -> 19)