-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfinalpyscript.py
317 lines (279 loc) · 19.6 KB
/
finalpyscript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
import pyspark
from pyspark.sql.functions import *
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
# Spark bootstrap: build one local SparkSession and expose the legacy
# SparkContext / SQLContext handles that the rest of this script uses.
# Building the session first and deriving sc from it avoids constructing a
# second, conflicting SparkContext; the copy-pasted docs placeholder
# config("spark.some.config.option", ...) was a no-op and is dropped.
spark = (
    SparkSession.builder
    .master("local")
    .appName("app name")
    .getOrCreate()
)
sc = spark.sparkContext
# NOTE(review): SQLContext is deprecated in modern PySpark; kept only because
# the final CTAS statement below calls sqlContext.sql().
sqlContext = SQLContext(sc)
# ---- Sampling data -----------------------------------------------------
# Each monthly raw CSV shares one header in which every column after the
# first carries a leading space (e.g. " passenger_count").  The loop below
# replaces twelve near-identical copy-pasted blocks that each read one
# month, took a 50% sample, stripped the header spaces, and wrote Parquet.

# Fraction of rows to keep; seed=0 makes the sample deterministic.
SAMPLE_FRACTION = 0.50

# Clean names of the columns whose raw header has a leading space.
RAW_COLUMNS = [
    "passenger_count", "trip_distance", "pickup_longitude",
    "pickup_latitude", "rate_code", "store_and_fwd_flag",
    "dropoff_longitude", "dropoff_latitude", "payment_type",
    "fare_amount", "surcharge", "mta_tax", "tip_amount",
    "tolls_amount", "total_amount", "pickup_datetime", "dropoff_datetime",
]


def _sample_month(month_number):
    # Read one month's raw trip CSV, sample 50% without replacement,
    # normalize the column names, and persist the result as Parquet.
    src = "s3a://nycproject23/rawdata/yellow_tripdata_2014-%02d.csv" % month_number
    dst = "s3a://nycproject23/sampledrawdata/df%d.parquet" % month_number
    sampled = spark.read.option("header", True).csv(src)
    sampled = sampled.sample(False, SAMPLE_FRACTION, seed=0)
    for clean_name in RAW_COLUMNS:
        sampled = sampled.withColumnRenamed(" " + clean_name, clean_name)
    sampled.write.parquet(dst)


for _month in range(1, 13):
    _sample_month(_month)
# ---- Cleaning data -----------------------------------------------------
# Re-read all sampled months at once and drop rows/columns flagged as bad.
# The "header" option is CSV-only and was a no-op on a Parquet read, so it
# is dropped here.
df1 = spark.read.parquet("s3a://nycproject23/sampledrawdata/*.parquet")
# '208' is a corrupt passenger_count value present in the raw data.
df2 = df1.filter("passenger_count != '208'")
# Invalid rate codes; ~isin(...) keeps exactly the rows the original
# 11-term chain of != comparisons kept (NULL rate_codes evaluate to NULL
# and are filtered out by both forms, and again by na.drop() below).
BAD_RATE_CODES = ['156', '208', '210', '28', '65', '77', '7', '8', '9', '0', '16']
df3 = df2.filter(~df2.rate_code.isin(BAD_RATE_CODES))
# store_and_fwd_flag is not used downstream, so remove it before the
# whole-row NULL drop.
final_df = df3.drop('store_and_fwd_flag')
final_df = final_df.na.drop()  # drop any row that still contains a NULL
# ---- Transformations on the pickup/dropoff datetime columns ------------
# Derive hour / weekday-name / year / month features from both timestamps,
# then register the result as a table for downstream SQL analysis.
# Explicit imports replace the wildcard import the original used here.
from pyspark.sql.functions import col, date_format, hour, month, year

df1 = (
    final_df
    .withColumn("pickup_hour", hour(col("pickup_datetime")))
    .withColumn("dropoff_hour", hour(col("dropoff_datetime")))
    # "EEEE" renders the full weekday name, e.g. "Monday".
    .withColumn("pickup_day", date_format(col("pickup_datetime"), "EEEE"))
    .withColumn("dropoff_day", date_format(col("dropoff_datetime"), "EEEE"))
    .withColumn("pickup_year", year(col("pickup_datetime")))
    .withColumn("dropoff_year", year(col("dropoff_datetime")))
    .withColumn("pickup_month", month(col("pickup_datetime")))
    .withColumn("dropoff_month", month(col("dropoff_datetime")))
)
df1.createOrReplaceTempView("taxitable")
# CTAS into the metastore.  spark.sql supersedes the deprecated
# SQLContext.sql; the explicit AS is the documented CTAS form and is
# equivalent to the original statement.
spark.sql("CREATE TABLE nyctaxi AS SELECT * FROM taxitable")