PySpark - Select columns by datatype in DataFrame

  1. Install PySpark module
  2. Create a DataFrame with schema fields
  3. Select the columns of each data type by checking the schema fields
  4. Display the data
pip install pyspark


import pyspark

from pyspark.sql import SparkSession

from pyspark.sql.types import StringType, DoubleType,IntegerType,StructType, StructField,FloatType

# Start a Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('kontexttech')
    .getOrCreate()
)

# Sample rows in column order: (rollno, name, fee).
values = [
    (1, "Gottumukkala Sravan Kumar", 4500.00),
    (2, "Bobby", 93445.000),
    (3, "Gnanesh", 88900.000),
]

# Every column is nullable; only the name and the Spark type differ.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("rollno", IntegerType()),
        ("name", StringType()),
        ("fee", FloatType()),
    )
])

# Build the DataFrame from the rows using the explicit schema above.
data = spark.createDataFrame(values, schema=schema)

# For each data type, collect the names of the schema fields whose dataType
# is an instance of that type, select those columns, and print the rows.
# Indexing a DataFrame with a list of column names selects those columns.

# Integer columns.
print(data[[field.name for field in data.schema.fields if isinstance(field.dataType, IntegerType)]].collect())

# String columns.
print(data[[field.name for field in data.schema.fields if isinstance(field.dataType, StringType)]].collect())

# Float columns.
print(data[[field.name for field in data.schema.fields if isinstance(field.dataType, FloatType)]].collect())


[Row(rollno=1), Row(rollno=2), Row(rollno=3)]

[Row(name='Gottumukkala Sravan Kumar'), Row(name='Bobby'), Row(name='Gnanesh')]

[Row(fee=4500.0), Row(fee=93445.0), Row(fee=88900.0)]

