# -*- coding: utf-8 -*-
"""
author SparkByExamples.com

Examples of converting a single PySpark DataFrame column into a plain
Python list, using the RDD API, ``select`` + ``collect``, and pandas.
"""
from collections import OrderedDict

from pyspark.sql import SparkSession

# Local single-threaded session; getOrCreate() reuses an active session
# if one already exists.
spark = (
    SparkSession.builder.master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: (firstname, lastname, country, state).
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]
columns = ["firstname", "lastname", "country", "state"]

df = spark.createDataFrame(data=data, schema=columns)
df.show()
# collect() returns every row to the driver as a list of Row objects.
print(df.collect())

# Example 1: pick the column by position (index 3 is "state") via the RDD.
states1 = df.rdd.map(lambda x: x[3]).collect()
print(states1)
#['CA', 'NY', 'CA', 'FL']

# Drop duplicates while keeping first-seen order.
res = list(OrderedDict.fromkeys(states1))
print(res)
#['CA', 'NY', 'FL']

#Example 2
# Same as above, but pick the column by attribute name on each Row.
states2 = df.rdd.map(lambda x: x.state).collect()
print(states2)
#['CA', 'NY', 'CA', 'FL']

# select() + collect() yields Row objects, not bare values.
states3 = df.select(df.state).collect()
print(states3)
#[Row(state='CA'), Row(state='NY'), Row(state='CA'), Row(state='FL')]

# flatMap over the single-column Rows unwraps each Row into its value.
states4 = df.select(df.state).rdd.flatMap(lambda x: x).collect()
print(states4)
#['CA', 'NY', 'CA', 'FL']

# Via pandas: toPandas() builds a pandas DataFrame; indexing by column
# name gives a Series, which list() turns into a Python list.
states5 = df.select(df.state).toPandas()['state']
states6 = list(states5)
print(states6)
#['CA', 'NY', 'CA', 'FL']

# Several columns at once: convert to pandas a single time, then read each
# column as its own list.
pandDF = df.select(df.state, df.firstname).toPandas()
print(list(pandDF['state']))
print(list(pandDF['firstname']))