RDD/DataFrame Type Check
RDD and DataFrame instance types can be checked in several ways in Python, Scala, and Java.
Check for DataFrame Type
Using Scala Pattern Match
import org.apache.spark.sql.DataFrame
// Load products data from csv
val df = spark.read
  .option("header", "true")
  .csv("csv/products.csv")

val isDF: Boolean = df match {
  case _: DataFrame => true
  case _ => false
}
// isDF value: true
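Since DataFrame is only a type alias for Dataset[Row], the Row parameter is erased at runtime and matching on DataFrame directly produces an unchecked warning from the compiler. A minimal sketch that avoids the warning by matching on the erased Dataset type instead, reusing the df above:
import org.apache.spark.sql.Dataset

val isDataset: Boolean = df match {
  case _: Dataset[_] => true
  case _ => false
}
// isDataset value: true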
Using Scala isInstanceOf
import org.apache.spark.sql.DataFrame
// Load products data from csv
val df: DataFrame = spark.read
  .option("header", "true")
  .csv("csv/products.csv")
val isDF: Boolean = df.isInstanceOf[DataFrame]
// isDF value: true
Using Python isinstance()
from pyspark.sql import DataFrame
# Load products data from csv
df = (spark.read
    .option('header', True)
    .csv('csv/products.csv'))
# check instance type
is_df = isinstance(df, DataFrame)
# is_df value: True
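In application code this check is often wrapped in a small guard so that functions fail fast on the wrong input. A minimal sketch with a hypothetical require_dataframe helper:
from pyspark.sql import DataFrame

def require_dataframe(obj):
    # Hypothetical helper: raise early if the argument is not a DataFrame
    if not isinstance(obj, DataFrame):
        raise TypeError(f'Expected DataFrame, got {type(obj).__name__}')
    return obj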
Using Python type()
Another way is to print or log the data type.
# Load products data from csv
df = (spark.read
    .option('header', True)
    .csv('csv/products.csv'))
# print instance type
print(f'df type: {type(df)}')
# df type: <class 'pyspark.sql.dataframe.DataFrame'>
Using Java instanceof
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
Dataset<Row> df = spark.read()
    .option("header", "true")
    .csv("csv/products.csv");
boolean isDF = df instanceof Dataset;
// isDF value: true
Using Java isInstance()
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
Dataset<Row> df = spark.read()
    .option("header", "true")
    .csv("csv/products.csv");
boolean isDF = Dataset.class.isInstance(df);
// isDF value: true
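Because Java generics are erased at runtime, both checks above can only confirm the raw Dataset type, not Dataset<Row> specifically. Class.isInstance is mainly useful when the target class is only known at runtime; a minimal sketch reusing the df above:
// The target class could come from configuration or reflection
Class<?> target = Dataset.class;
Object candidate = df;
boolean matches = target.isInstance(candidate);
// matches value: true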
Check for RDD Type
Using Scala Pattern Match
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
// Load products data from csv
val rdd: RDD[Row] = spark.read
  .option("header", "true")
  .csv("csv/products.csv")
  .rdd

val isRDD: Boolean = rdd match {
  case _: RDD[_] => true
  case _ => false
}
// isRDD value: true
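Pattern matching is most useful when a value may be either type. A minimal sketch of a hypothetical describe helper that dispatches on the runtime type:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset

// Hypothetical helper: report what kind of value was passed in
def describe(obj: Any): String = obj match {
  case ds: Dataset[_] => s"Dataset with ${ds.columns.length} columns"
  case r: RDD[_] => s"RDD with ${r.getNumPartitions} partitions"
  case other => s"Unsupported type: ${other.getClass.getName}"
}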
Using Scala isInstanceOf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
// Load products data from csv
val rdd: RDD[Row] = spark.read
  .option("header", "true")
  .csv("csv/products.csv")
  .rdd
val isRDD: Boolean = rdd.isInstanceOf[RDD[_]]
// isRDD value: true
Using Python isinstance()
from pyspark.rdd import RDD
# Load products data from csv
rdd = (spark.read
    .option('header', True)
    .csv('csv/products.csv')
    .rdd)
# check instance type
is_rdd = isinstance(rdd, RDD)
# is_rdd value: True
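isinstance also makes it easy to accept either type and normalize the input. A minimal sketch with a hypothetical as_rdd helper that converts a DataFrame to its underlying RDD:
from pyspark.rdd import RDD
from pyspark.sql import DataFrame

def as_rdd(obj):
    # Hypothetical helper: return an RDD for either input type
    if isinstance(obj, DataFrame):
        return obj.rdd
    if isinstance(obj, RDD):
        return obj
    raise TypeError(f'Expected DataFrame or RDD, got {type(obj).__name__}')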
Using Python type()
Another way is to print or log the data type.
from pyspark.rdd import RDD
# Load products data from csv
rdd = (spark.read
    .option('header', True)
    .csv('csv/products.csv')
    .rdd)
# print object type
print(f'rdd type: {type(rdd)}')
# rdd type: <class 'pyspark.rdd.RDD'>
Using Java instanceof
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Row;
RDD<Row> rdd = spark.read()
    .option("header", "true")
    .csv("csv/products.csv")
    .rdd();
boolean isRDD = rdd instanceof RDD;
// isRDD value: true
Using Java isInstance()
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Row;
RDD<Row> rdd = spark.read()
    .option("header", "true")
    .csv("csv/products.csv")
    .rdd();
boolean isRDD = RDD.class.isInstance(rdd);
// isRDD value: true
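One caveat in Java: Dataset#javaRDD() returns a JavaRDD wrapper, which is a different class from org.apache.spark.rdd.RDD, so the checks above would fail on it. A minimal sketch illustrating the difference, reusing the same spark session:
import org.apache.spark.api.java.JavaRDD;

JavaRDD<Row> javaRdd = spark.read()
    .option("header", "true")
    .csv("csv/products.csv")
    .javaRDD();
boolean isJavaRdd = javaRdd instanceof JavaRDD;
// isJavaRdd value: true
boolean isScalaRdd = ((Object) javaRdd) instanceof RDD;
// isScalaRdd value: false (JavaRDD wraps an RDD but does not extend it)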