Prerequisites
- Apache Spark
- IntelliJ IDEA Community Edition
Walk-through
In this article, I will walk you through how to create a Spark DataFrame in an Apache Spark application using IntelliJ IDEA Community Edition.
part_1_create_first_apache_spark_dataframe.scala
package com.datamaking.apache.spark.dataframe

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/** Record type used by Approach 3 to build a DataFrame from case-class instances. */
case class User(user_id: Int, user_name: String, user_city: String)

/**
 * Tutorial application that builds the same five-row user data set as a
 * DataFrame in three different ways and prints each result:
 *
 *  1. from an RDD of tuples, with column names applied afterwards via `toDF`;
 *  2. from an RDD of `Row` objects plus an explicit `StructType` schema;
 *  3. from an RDD of `User` case-class instances.
 */
object part_1_create_first_apache_spark_dataframe {

  /**
   * Starts a local SparkSession, runs the three DataFrame-creation approaches,
   * and stops the session.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    println("Application Started ...")

    val spark = SparkSession
      .builder()
      .appName("Create First Apache Spark DataFrame")
      .master("local[*]")
      .getOrCreate()

    // try/finally guarantees the session is stopped even when one of the
    // approaches below throws; previously stop() ran only on the success path.
    try {
      spark.sparkContext.setLogLevel("ERROR")

      println("Approach 1: ")
      // Code Block 1 Starts Here
      val users_list = List(
        (1, "John", "London"),
        (2, "Martin", "New York"),
        (3, "Sam", "Sydney"),
        (4, "Alan", "Mexico City"),
        (5, "Jacob", "Florida")
      )

      // Equivalent one-liner, kept for reference:
      // val users_df = spark.createDataFrame(spark.sparkContext.parallelize(users_list))
      val df_columns = Seq("user_id", "user_name", "user_city")
      val users_rdd = spark.sparkContext.parallelize(users_list)

      // No column names are supplied here; they are applied below with toDF.
      val users_df = spark.createDataFrame(users_rdd)
      users_df.show(5, truncate = false)
      println(users_df.getClass)

      val users_df_1 = users_df.toDF(df_columns: _*)
      println(users_df_1.getClass)
      users_df_1.show(5, truncate = false)
      // Code Block 1 Ends Here

      println("Approach 2: ")
      // Code Block 2 Starts Here
      val users_seq = Seq(
        Row(1, "John", "London"),
        Row(2, "Martin", "New York"),
        Row(3, "Sam", "Sydney"),
        Row(4, "Alan", "Mexico City"),
        Row(5, "Jacob", "Florida")
      )

      // Row objects carry no field names or types, so the schema is given explicitly.
      val users_schema = StructType(Array(
        StructField("user_id", IntegerType, nullable = true),
        StructField("user_name", StringType, nullable = true),
        StructField("user_city", StringType, nullable = true)
      ))

      val users_df_2 =
        spark.createDataFrame(spark.sparkContext.parallelize(users_seq), users_schema)
      users_df_2.show(5, truncate = false)
      // Code Block 2 Ends Here

      println("Approach 3: ")
      // Code Block 3 Starts Here
      val case_users_seq = Seq(
        User(1, "John", "London"),
        User(2, "Martin", "New York"),
        User(3, "Sam", "Sydney"),
        User(4, "Alan", "Mexico City"),
        User(5, "Jacob", "Florida")
      )

      val case_users_rdd = spark.sparkContext.parallelize(case_users_seq)
      val case_users_df = spark.createDataFrame(case_users_rdd)
      case_users_df.show(5, truncate = false)
      // Code Block 3 Ends Here
    } finally {
      spark.stop()
    }

    println("Application Completed.")
  }
}
build.sbt
// Build definition for the Apache Spark DataFrame tutorial project.
name := "apache_spark_dataframe_practical_tutorial"

version := "1.0"

scalaVersion := "2.11.8"

// All library dependencies, declared together in their original order.
// `%%` lets sbt append the Scala binary version to the artifact name;
// `%` uses the artifact name exactly as written.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.4.4",
  // https://mvnrepository.com/artifact/com.databricks/spark-xml
  "com.databricks" %% "spark-xml" % "0.7.0",
  // https://mvnrepository.com/artifact/mysql/mysql-connector-java
  "mysql" % "mysql-connector-java" % "8.0.18",
  // https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector
  "org.mongodb.spark" %% "mongo-spark-connector" % "2.4.1",
  // https://mvnrepository.com/artifact/com.datastax.spark/spark-cassandra-connector
  "com.datastax.spark" %% "spark-cassandra-connector" % "2.4.1",
  // https://mvnrepository.com/artifact/org.apache.spark/spark-sql-kafka-0-10_2.12
  "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.4",
  // https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients
  "org.apache.kafka" % "kafka-clients" % "2.3.1"
)
Summary
In this article, we have successfully created a Spark DataFrame in an Apache Spark application using IntelliJ IDEA Community Edition. Please go through all these steps, share your feedback, and post any questions or doubts you may have. Thank you, much appreciated. Happy learning!
0 Comments