A distributed dataset organized as columns (each with a column name, a column type, and column values); the named columns make it roughly equivalent to a table in a relational database.

A DataFrame is a Dataset organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. DataFrames can be constructed from a wide array of sources such as: structured data files, tables in Hive, external databases, or existing RDDs. In Scala and Java, a DataFrame is represented by a Dataset of Rows.
In the Scala API, DataFrame is simply a type alias of Dataset[Row], while in the Java API users need to use Dataset<Row> to represent a DataFrame.
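
To make the alias concrete, here is a minimal sketch (the object name and sample data are made up) showing that a DataFrame built from an in-memory Seq can be assigned to a Dataset[Row] without any conversion:

    import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

    object TypeAliasDemo {
      def main(args: Array[String]): Unit = {
        val spark: SparkSession = SparkSession.builder()
          .appName("TypeAliasDemo")
          .master("local[*]")
          .getOrCreate()
        import spark.implicits._

        // A DataFrame built from an in-memory Seq
        val df: DataFrame = Seq(("Andy", 30), ("Justin", 19)).toDF("name", "age")

        // No conversion needed: in Scala, DataFrame is Dataset[Row]
        val ds: Dataset[Row] = df
        ds.show()

        spark.stop()
      }
    }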

API Operations

printSchema

Prints the schema as a tree.

    import org.apache.spark.sql.{DataFrame, SparkSession}

    object DataFrameApp {
      def main(args: Array[String]): Unit = {
        val spark: SparkSession = SparkSession.builder()
          .appName("DataFrameApp")
          .master("local[*]")
          .getOrCreate()
        val peopleDF: DataFrame = spark.read.json("resources/people.json")
        peopleDF.printSchema()
        spark.stop()
      }
    }

Output:

    root
     |-- age: long (nullable = true)
     |-- name: string (nullable = true)
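
The same information is also available programmatically: schema returns the DataFrame's StructType (whose treeString is exactly what printSchema prints), and dtypes returns (columnName, typeName) pairs. A small sketch, continuing from the peopleDF above:

    // StructType describing all columns; treeString is what printSchema prints
    val schema = peopleDF.schema
    println(schema.treeString)

    // (columnName, typeName) pairs
    peopleDF.dtypes.foreach { case (name, tpe) => println(s"$name -> $tpe") }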

show

Displays the first 20 rows by default; pass an argument to control how many rows are shown.

    package cn.bx.spark

    import org.apache.spark.sql.{DataFrame, SparkSession}

    object DataFrameApp {
      def main(args: Array[String]): Unit = {
        val spark: SparkSession = SparkSession.builder()
          .appName("DataFrameApp")
          .master("local[*]")
          .getOrCreate()
        val peopleDF: DataFrame = spark.read.json("resources/people.json")
        peopleDF.show(1)
        spark.stop()
      }
    }

Output:

    +----+-------+
    | age|   name|
    +----+-------+
    |null|Michael|
    +----+-------+
    only showing top 1 row
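
show also has an overload show(numRows, truncate); by default string cells longer than 20 characters are truncated, so passing truncate = false is useful when inspecting long values:

    // Show up to 5 rows without truncating long cell values
    peopleDF.show(5, truncate = false)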

select

Selects the specified columns for output.

    package cn.bx.spark

    import org.apache.spark.sql.{DataFrame, SparkSession}

    object DataFrameApp {
      def main(args: Array[String]): Unit = {
        val spark: SparkSession = SparkSession.builder()
          .appName("DataFrameApp")
          .master("local[*]")
          .getOrCreate()
        val peopleDF: DataFrame = spark.read.json("resources/people.json")
        peopleDF.select("name", "age").show()
        spark.stop()
      }
    }

Output:

    +-------+----+
    |   name| age|
    +-------+----+
    |Michael|null|
    |   Andy|  30|
    | Justin|  19|
    +-------+----+

Computing a derived column

    peopleDF.select(peopleDF.col("name"), peopleDF.col("age") + 1).show()

Output:

    +-------+---------+
    |   name|(age + 1)|
    +-------+---------+
    |Michael|     null|
    |   Andy|       31|
    | Justin|       20|
    +-------+---------+
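
Note that this select drops every column not listed. If the goal is to keep all existing columns and append the derived one, withColumn is the usual idiom (the new column name below is arbitrary):

    // Keeps the original columns and appends a derived one
    peopleDF.withColumn("agePlusOne", peopleDF.col("age") + 1).show()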

The $ syntactic sugar

    package cn.bx.spark

    import org.apache.spark.sql.{DataFrame, SparkSession}

    object DataFrameApp {
      def main(args: Array[String]): Unit = {
        val spark: SparkSession = SparkSession.builder()
          .appName("DataFrameApp")
          .master("local[*]")
          .getOrCreate()
        val peopleDF: DataFrame = spark.read.json("resources/people.json")
        import spark.implicits._
        peopleDF.select($"name", $"age" + 1).show()
        spark.stop()
      }
    }
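
The $ interpolator only exists after import spark.implicits._; without that import the expression will not compile. An equivalent form uses the col function from org.apache.spark.sql.functions:

    import org.apache.spark.sql.functions.col

    // col("name") builds the same Column expression as $"name"
    peopleDF.select(col("name"), col("age") + 1).show()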

filter

Filters rows that satisfy a condition.

    package cn.bx.spark

    import org.apache.spark.sql.{DataFrame, SparkSession}

    object DataFrameApp {
      def main(args: Array[String]): Unit = {
        val spark: SparkSession = SparkSession.builder()
          .appName("DataFrameApp")
          .master("local[*]")
          .getOrCreate()
        val peopleDF: DataFrame = spark.read.json("resources/people.json")
        peopleDF.filter(peopleDF.col("age") > 19).show()
        spark.stop()
      }
    }

Output:

    +---+----+
    |age|name|
    +---+----+
    | 30|Andy|
    +---+----+
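
filter also accepts a SQL-style condition string, and where is an alias of filter, so the following lines are equivalent to the example above (the $ form again assumes import spark.implicits._ is in scope):

    // SQL expression string
    peopleDF.filter("age > 19").show()

    // where is an alias of filter
    peopleDF.where($"age" > 19).show()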

groupBy

    package cn.bx.spark

    import org.apache.spark.sql.{DataFrame, SparkSession}

    object DataFrameApp {
      def main(args: Array[String]): Unit = {
        val spark: SparkSession = SparkSession.builder()
          .appName("DataFrameApp")
          .master("local[*]")
          .getOrCreate()
        val peopleDF: DataFrame = spark.read.json("resources/people.json")
        peopleDF.groupBy(peopleDF.col("age")).count().show()
        spark.stop()
      }
    }

Output:

    +----+-----+
    | age|count|
    +----+-----+
    |  19|    1|
    |null|    1|
    |  30|    1|
    +----+-----+
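
count is just one built-in aggregate; the more general agg accepts any aggregate functions from org.apache.spark.sql.functions. A sketch (the output column names are arbitrary):

    import org.apache.spark.sql.functions.{avg, count}

    // Count of names per age group
    peopleDF.groupBy("age").agg(count("name").as("nameCount")).show()

    // Aggregating the whole DataFrame, with no grouping
    peopleDF.agg(avg("age").as("avgAge")).show()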