数据湖探索 DLI-pyspark样例代码:完整示例代码

时间:2024-06-20 11:23:10

完整示例代码

  • 通过SQL API访问 MRS 的OpenTSDB
    # _*_ coding: utf-8 _*_
    from __future__ import print_function
    from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType
    from pyspark.sql import SparkSession
    
    if __name__ == "__main__":
        # Create (or reuse) the SparkSession that runs this sample.
        sparkSession = SparkSession.builder.appName("datasource-opentsdb").getOrCreate()

        try:
            # Create a DLI cross-source association table backed by OpenTSDB.
            # Adjacent string literals are used instead of a backslash
            # continuation inside the string, so the source indentation does
            # not end up inside the SQL statement.
            sparkSession.sql(
                "create table opentsdb_test using opentsdb options("
                "'Host'='10.0.0.171:4242',"
                "'metric'='cts_opentsdb',"
                "'tags'='city,location')")

            # Insert one data point through the association table.
            sparkSession.sql(
                "insert into opentsdb_test values('aaa', 'abc', '2021-06-30 18:00:00', 30.0)")

            # Read the table back and print it.
            result = sparkSession.sql("SELECT * FROM opentsdb_test")
            result.show()
        finally:
            # Stop the session even when one of the statements above fails,
            # so the Spark resources are always released.
            sparkSession.stop()
  • 通过DataFrame API访问 CloudTable 的OpenTSDB
    # _*_ coding: utf-8 _*_
    from __future__ import print_function
    from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType
    from pyspark.sql import SparkSession
    
    if __name__ == "__main__":
        # Create (or reuse) the SparkSession that runs this sample.
        sparkSession = SparkSession.builder.appName("datasource-opentsdb").getOrCreate()

        # Cross-source connection parameters. They are defined once and used
        # for both the table definition and the DataFrame reader, so that the
        # write and the read address the same OpenTSDB metric.
        metric = "ct_opentsdb"
        tags = "city,location"
        host = "opentsdb-3xcl8dir15m58z3.cloudtable.com:4242"

        try:
            # Create a DLI cross-source association opentsdb data table.
            sparkSession.sql(
                "create table opentsdb_test using opentsdb options("
                "'Host'='{host}',"
                "'metric'='{metric}',"
                "'tags'='{tags}')".format(host=host, metric=metric, tags=tags))

            # Create an RDD holding the sample row. A plain int literal is
            # used for the timestamp: the "123456L" long suffix is Python 2
            # only and is a syntax error on Python 3.
            dataList = sparkSession.sparkContext.parallelize([("aaa", "abc", 123456, 30.0)])

            # Schema of the sample row. The list is already bracketed, so no
            # line-continuation backslashes are needed.
            schema = StructType([
                StructField("location", StringType()),
                StructField("name", StringType()),
                StructField("timestamp", LongType()),
                StructField("value", DoubleType()),
            ])

            # Create a DataFrame from the RDD and the schema.
            dataFrame = sparkSession.createDataFrame(dataList, schema)

            # Write the data through the association table. insertInto()
            # resolves columns by position, not by the schema field names.
            dataFrame.write.insertInto("opentsdb_test")
            # NOTE: OpenTSDB does not currently implement the CTAS method to
            # save data, so dataFrame.write.format("opentsdb")...save() cannot
            # be used here.

            # Read the data back from CloudTable-OpenTSDB. The chain is
            # parenthesised instead of using backslash continuations, which
            # break as soon as trailing whitespace follows the backslash.
            opentsdb_df = (
                sparkSession.read
                .format("opentsdb")
                .option("Host", host)
                .option("metric", metric)
                .option("tags", tags)
                .load()
            )
            opentsdb_df.show()
        finally:
            # Stop the session even when one of the steps above fails, so the
            # Spark resources are always released.
            sparkSession.stop()
support.huaweicloud.com/devg-dli/dli_09_0081.html