https://grouplens.org/datasets/movielens/100k/

用户信息数据（u.user，字段以 `|` 分隔：用户 ID | 年龄 | 性别 | 职业 | 邮编）

  1. g$ head -10 u.user
  2. 1|24|M|technician|85711
  3. 2|53|F|other|94043
  4. 3|23|M|writer|32067
  5. 4|24|M|technician|43537
  6. 5|33|F|other|15213
  7. 6|42|M|executive|98101
  8. 7|57|M|administrator|91344
  9. 8|36|M|administrator|05201
  10. 9|29|M|student|01002
  11. 10|53|M|lawyer|90703

RDD 实践

  1. package cn.bx.spark
  2. import org.apache.spark.rdd.RDD
  3. import org.apache.spark.sql.{DataFrame, SparkSession}
  /**
    * One record of the MovieLens `u.user` file.
    *
    * @param id         user identifier (first column)
    * @param age        age (second column)
    * @param gender     gender code as it appears in the file, e.g. "M" or "F"
    * @param occupation occupation label, e.g. "technician"
    * @param zip        ZIP code; kept as a String so values such as "05201"
    *                   do not lose their leading zero
    */
  case class User(
    id: Long,
    age: Int,
    gender: String,
    occupation: String,
    zip: String
  )
  /**
    * Loads the MovieLens `u.user` file into a DataFrame and prints up to ten
    * users older than 30 using Spark SQL.
    */
  object MovieUser {

    /**
      * Entry point: reads `ml-100k/u.user`, parses each pipe-delimited line
      * into a [[User]], registers the result as the temporary view `user`
      * and shows the rows returned by the SQL query.
      *
      * @param args command-line arguments (unused)
      */
    def main(args: Array[String]): Unit = {
      val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
      try {
        // Brings the RDD -> DataFrame conversions (toDF) into scope.
        import spark.implicits._

        val linesRDD: RDD[String] = spark.sparkContext.textFile("ml-100k/u.user")

        // split takes a regex, so the pipe has to be escaped; the -1 limit keeps
        // trailing empty fields so that index 4 exists even when the zip is empty.
        val mapRDD: RDD[Array[String]] = linesRDD.map(_.split("\\|", -1))

        // Column layout: id | age | gender | occupation | zip.
        // The zip code is column 4 (the original read column 3, i.e. occupation, twice).
        val userRDD: RDD[User] = mapRDD.map { fields =>
          User(
            id = fields(0).toLong,
            age = fields(1).toInt,
            gender = fields(2),
            occupation = fields(3),
            zip = fields(4)
          )
        }

        val dataFrame: DataFrame = userRDD.toDF()
        dataFrame.createOrReplaceTempView("user")

        val userResult: DataFrame = spark.sql("select * from user where age >30 limit 10")
        userResult.show()
      } finally {
        // Always release the session, even when reading or parsing fails.
        spark.stop()
      }
    }
  }