RDD/DataFrame Type Check

The type of an RDD or DataFrame instance can be checked in several ways in Python, Scala, and Java. All of the examples below assume an active SparkSession named spark.
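
Type checks like these are most useful when a function accepts either kind of input and must branch on it. The helper below is a minimal Python sketch (to_dataframe is a hypothetical name, not a Spark API); the RDD branch assumes the elements are Row objects so that spark.createDataFrame can infer a schema.

from pyspark.rdd import RDD
from pyspark.sql import DataFrame

def to_dataframe(data):
    # Hypothetical helper: normalize the input to a DataFrame
    if isinstance(data, DataFrame):
        return data
    if isinstance(data, RDD):
        # assumes an RDD of Row objects and the SparkSession 'spark' used below
        return spark.createDataFrame(data)
    raise TypeError(f'expected DataFrame or RDD, got {type(data).__name__}')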

Check for DataFrame Type

Using Scala Pattern Match

import org.apache.spark.sql.DataFrame

// Load products data from csv
val df = spark.read
              .option("header", "true")
              .csv("csv/products.csv")

val isDF: Boolean = df match {
  case _: DataFrame => true
  case _ => false
}

// isDF value: true

Using Scala isInstanceOf

import org.apache.spark.sql.DataFrame

// Load products data from csv
val df: DataFrame = spark.read
              .option("header", "true")
              .csv("csv/products.csv")

val isDF: Boolean = df.isInstanceOf[DataFrame]

// isDF value: true

Using Python isinstance() method

from pyspark.sql import DataFrame

# Load products data from csv
df = (spark.read
          .option('header', True)
          .csv('csv/products.csv'))

# check instance type
is_df = isinstance(df, DataFrame)
# is_df value: True
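
A common follow-up is to fail fast when a caller passes the wrong type; the guard below is a hypothetical sketch built on the same check:

def assert_dataframe(obj):
    # raise early instead of failing deep inside a transformation
    if not isinstance(obj, DataFrame):
        raise TypeError(f'expected a DataFrame, got {type(obj).__name__}')

assert_dataframe(df)  # passes silently for the DataFrame loaded above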

Using Python type() method

Another way is to print or log the data type.

# Load products data from csv
df = (spark.read
          .option('header', True)
          .csv('csv/products.csv'))

# print instance type
print(f'df type: {type(df)}') 
# df type: <class 'pyspark.sql.dataframe.DataFrame'>
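
Note that type() reports the exact runtime class, whereas isinstance() also accepts subclasses; a quick sketch of the difference:

from pyspark.sql import DataFrame

# type() demands an exact class match; isinstance() also matches subclasses
print(type(df) is DataFrame)      # True: spark.read.csv returns a plain DataFrame
print(isinstance(df, DataFrame))  # True: would also hold for any subclass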

Using Java instanceof

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

Dataset<Row> df = spark.read()
              .option("header", "true")
              .csv("csv/products.csv");

boolean isDF = df instanceof Dataset;
//  isDF value: true

Using Java isInstance() method

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

Dataset<Row> df = spark.read()
              .option("header", "true")
              .csv("csv/products.csv");

boolean isDF = Dataset.class.isInstance(df);

//  isDF value: true

Check for RDD Type

Using Scala Pattern Match

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

// Load products data from csv
val rdd: RDD[Row] = spark.read
              .option("header", "true")
              .csv("csv/products.csv")
              .rdd

val isRDD: Boolean = rdd match {
  case _: RDD[_] => true
  case _ => false
}

// isRDD value: true

Using Scala isInstanceOf

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

// Load products data from csv
val rdd: RDD[Row] = spark.read
              .option("header", "true")
              .csv("csv/products.csv")
              .rdd

val isRDD: Boolean = rdd.isInstanceOf[RDD[_]]

// isRDD value: true

Using Python isinstance() method

from pyspark.rdd import RDD

# Load products data from csv
rdd = (spark.read
           .option('header', True)
           .csv('csv/products.csv')
           .rdd)

# check instance type
is_rdd = isinstance(rdd, RDD)
# is_rdd value: True
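
Because RDD and DataFrame are unrelated classes in PySpark, the two checks never overlap; continuing the example above:

from pyspark.sql import DataFrame

# an RDD is not a DataFrame, and vice versa
print(isinstance(rdd, RDD))        # True
print(isinstance(rdd, DataFrame))  # False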

Using Python type() method

Another way is to print or log the data type.

from pyspark.rdd import RDD

# Load products data from csv
rdd = (spark.read
           .option('header', True)
           .csv('csv/products.csv')
           .rdd)

# print object type
print(f'rdd type: {type(rdd)}')
# rdd type: <class 'pyspark.rdd.RDD'>
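
The same check can feed a logger instead of print(); a minimal sketch using Python's standard logging module:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# log the runtime type instead of printing it
logger.info('rdd type: %s', type(rdd).__name__)  # rdd type: RDD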

Using Java instanceof

import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Row;

RDD<Row> rdd = spark.read()
              .option("header", "true")
              .csv("csv/products.csv")
              .rdd();

boolean isRDD = rdd instanceof RDD;
//  isRDD value: true

Using Java isInstance() method

import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Row;

RDD<Row> rdd = spark.read()
              .option("header", "true")
              .csv("csv/products.csv")
              .rdd();

boolean isRDD = RDD.class.isInstance(rdd);

//  isRDD value: true