https://grouplens.org/datasets/movielens/100k/
用户信息数据
$ head -10 u.user
1|24|M|technician|85711
2|53|F|other|94043
3|23|M|writer|32067
4|24|M|technician|43537
5|33|F|other|15213
6|42|M|executive|98101
7|57|M|administrator|91344
8|36|M|administrator|05201
9|29|M|student|01002
10|53|M|lawyer|90703
rdd 实践
package cn.bx.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/** One record of the pipe-delimited `u.user` file.
  *
  * The columns are read in file order: id, age, gender, occupation, zip.
  *
  * @param id         user id (first column)
  * @param age        age in years (second column)
  * @param gender     gender code as it appears in the file (third column)
  * @param occupation occupation label (fourth column)
  * @param zip        zip code (fifth column); kept as text rather than parsed to a number
  */
case class User(id: Long, age: Int, gender: String, occupation: String, zip: String)

/** Loads `ml-100k/u.user` into an RDD of [[User]], registers it as the temp view
  * `user`, and prints up to ten users older than 30 using Spark SQL.
  */
object MovieUser {

  /** Program entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()

    // Stop the session even when reading, parsing or the query fails.
    try {
      val linesRDD: RDD[String] = spark.sparkContext.textFile("ml-100k/u.user")

      // String.split takes a regular expression, and a bare "|" means alternation,
      // so the pipe delimiter has to be escaped.
      val mapRDD: RDD[Array[String]] = linesRDD.map(_.split("\\|"))

      // The zip code is the fifth column, fields(4); fields(3) is the occupation.
      val userRDD: RDD[User] = mapRDD.map { fields =>
        User(fields(0).toLong, fields(1).toInt, fields(2), fields(3), fields(4))
      }

      // Brings the RDD-to-DataFrame conversion (toDF) into scope.
      import spark.implicits._
      val dataFrame: DataFrame = userRDD.toDF()
      dataFrame.createOrReplaceTempView("user")

      val userResult: DataFrame = spark.sql("select * from user where age >30 limit 10")
      userResult.show()
    } finally {
      spark.stop()
    }
  }
}
