数据湖探索 DLI-pyspark样例代码:完整示例代码

时间:2024-06-20 11:23:10

完整示例代码

  • 通过DataFrame API 访问
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    # _*_ coding: utf-8 _*_
    from __future__ import print_function
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType
    from pyspark.sql import SparkSession
    if __name__ == "__main__":
      # Sample flow: build a small DataFrame, write it to a Redis table through
      # the "redis" data source, read the table back and display it.

      # Create a SparkSession.
      sparkSession = SparkSession.builder.appName("datasource-redis").getOrCreate()

      # Set cross-source connection parameters.
      host = "192.168.4.199"
      port = "6379"
      table = "person"
      auth = "######"

      # Create a DataFrame and initialize the DataFrame data.
      # ******* method one *******
      # Build the DataFrame from an RDD of tuples plus an explicit schema.
      dataList = sparkSession.sparkContext.parallelize([(1, "Katie", 19), (2, "Tom", 20)])
      schema = StructType([
          StructField("id", IntegerType(), False),
          StructField("name", StringType(), False),
          StructField("age", IntegerType(), False),
      ])
      # The result is bound to `dataFrame` because the write step below uses
      # that name; the original bound it to `dataFrame_one`, which left
      # `dataFrame` undefined and made the write raise NameError.
      dataFrame = sparkSession.createDataFrame(dataList, schema)

      # ****** method two ******
      # Alternative to method one: create the DataFrame directly from a list and
      # rename the auto-generated columns. Uncomment to use instead of method one.
      # jdbcDF = sparkSession.createDataFrame([(3,"Jack", 23)])
      # dataFrame = jdbcDF.withColumnRenamed("_1", "id").withColumnRenamed("_2", "name").withColumnRenamed("_3", "age")

      # Write data to the redis table.
      dataFrame.write.format("redis").option("host", host).option("port", port).option("table", table).option("password", auth).mode("Overwrite").save()
      # Read data back from the redis table and print it.
      sparkSession.read.format("redis").option("host", host).option("port", port).option("table", table).option("password", auth).load().show()

      # Close the session.
      sparkSession.stop()
    
  • 通过SQL API 访问
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    # _*_ coding: utf-8 _*_
    from __future__ import print_function
    from pyspark.sql import SparkSession
    
    if __name__ == "__main__":
      # Sample flow: register a temporary view backed by a Redis table, insert
      # two rows through SQL, then query the view and print every row.

      # Create a SparkSession.
      sparkSession = SparkSession.builder.appName("datasource_redis").getOrCreate()

      # Register the Redis-backed temporary view. The trailing backslashes
      # continue the string literal across lines; Python strings have no
      # Scala-style `stripMargin`, so the statement is passed to sql() as-is.
      sparkSession.sql(
        "CREATE TEMPORARY VIEW person (name STRING, age INT) USING org.apache.spark.sql.redis OPTIONS (\
        'host' = '192.168.4.199', \
        'port' = '6379',\
        'password' = '######',\
        'table'= 'person')")

      # Insert sample rows into the view.
      sparkSession.sql("INSERT INTO TABLE person VALUES ('John', 30),('Peter', 45)")

      # collect() returns a Python list of Row objects; iterate and print each
      # one (the Scala idiom `.foreach(println)` does not exist in Python).
      for row in sparkSession.sql("SELECT * FROM person").collect():
        print(row)

      # Close the session.
      sparkSession.stop()
    
support.huaweicloud.com/devg-dli/dli_09_0097.html