I need to write data to a JSON file in the format below using PySpark.
{"list-item": [
{"author":"author1","title":"title1","pages":1,"email":"author1@gmail.com"},
{"author":"author2","title":"title2","pages":2,"email":"author2@gmail.com"},
{"author":"author3","title":"title3","pages":3,"email":"author3@gmail.com"},
{"author":"author4","title":"title4","pages":4,"email":"author4@gmail.com"}
],"version": 1}
I have written the PySpark code below, but it escapes the inner quotes as \" and adds a double quote (") at the beginning and end of each item. How can I remove the backslashes and the enclosing double quotes?
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col,to_json,struct,collect_list,lit
from datetime import datetime
from time import time
if __name__ == '__main__':
    # Reproduction script: build a four-row DataFrame of book records,
    # collapse it into a single row holding a "list-item" array plus a
    # "version" field, and write that row out as JSON into a directory
    # named after the current timestamp.
    spark = SparkSession.builder.appName("Test").enableHiveSupport().getOrCreate()

    # All four columns are declared non-nullable.
    schema = StructType([
        StructField("author", StringType(), False),
        StructField("title", StringType(), False),
        StructField("pages", IntegerType(), False),
        StructField("email", StringType(), False)
    ])
    data = [
        ["author1", "title1", 1, "author1@gmail.com"],
        ["author2", "title2", 2, "author2@gmail.com"],
        ["author3", "title3", 3, "author3@gmail.com"],
        ["author4", "title4", 4, "author4@gmail.com"]
    ]
    df = spark.createDataFrame(data, schema)

    # to_json() turns each row's struct into a JSON *string*, so
    # collect_list() gathers an array of strings rather than an array of
    # structs.
    # NOTE(review): this is presumably why the JSON writer emits every
    # element quoted with its inner quotes backslash-escaped; collecting
    # struct("author", "title", "pages", "email") directly (without
    # to_json) should yield nested objects instead -- confirm.
    df = df.select(
        to_json(struct("author", "title", "pages", "email")).alias("json-data")
    ).agg(collect_list("json-data").alias("list-item"))

    # The version is built from the string literal "1.0" cast to an integer.
    df = df.withColumn("version", lit("1.0").cast(IntegerType()))

    df.printSchema()
    df.show(2, False)  # second argument False disables column truncation

    # The output directory is named after the current local time.
    curDT = datetime.now()
    targetPath = curDT.strftime("%m-%d-%Y-%H-%M-%S")

    # NOTE(review): "escape" looks like a CSV option; it presumably has no
    # effect on the JSON writer -- confirm against the writer's documented
    # options.
    df.write.format("json").mode("overwrite").option("escape", "").save(targetPath)
My code writes the JSON with backslashes and with each item enclosed in double quotes, as shown below. How can I remove those? Please help.
{"list-item":["{\"author\":\"author1\",\"title\":\"title1\",\"pages\":1,\"email\":\"author1@gmail.com\"}","{\"author\":\"author2\",\"title\":\"title2\",\"pages\":2,\"email\":\"author2@gmail.com\"}","{\"author\":\"author3\",\"title\":\"title3\",\"pages\":3,\"email\":\"author3@gmail.com\"}","{\"author\":\"author4\",\"title\":\"title4\",\"pages\":4,\"email\":\"author4@gmail.com\"}"],"version":1}