阅读卡桑德拉使用Spark流

我有一个问题，当我使用spark流读取从Cassandra

https://github.com/datastax/spark-cassandra-connector/blob/master/doc/8_streaming.md reading-from-cassandra-from-the-streamingcontext

如上面的链接，我使用

val rdd = ssc.cassandraTable("streaming_test", "key_value").select("key", "value").where("fu = ?", 3)

从cassandra中选择数据，但似乎spark流只有一次查询，但我希望它继续使用间隔10秒查询。

我的代码如下，希望你的回应。

谢谢!

import org.apache.spark._
import org.apache.spark.streaming._
import com.datastax.spark.connector.streaming._
import org.apache.spark.rdd._
import scala.collection.mutable.Queue

object SimpleApp {
def main(args: Array[String]){
    val conf = new SparkConf().setAppName("scala_streaming_test").set("spark.cassandra.connection.host", "127.0.0.1")
    val ssc = new StreamingContext(conf, Seconds(10))
    val rdd = ssc.cassandraTable("mykeyspace", "users").select("fname", "lname").where("lname = ?", "yu")
    //rdd.collect().foreach(println)
    val rddQueue = new Queue[RDD[com.datastax.spark.connector.CassandraRow]]()

    val dstream = ssc.queueStream(rddQueue)
    dstream.print()
    ssc.start()
    rdd.collect().foreach(println)
    rddQueue += rdd
    ssc.awaitTermination()
}

}

可以用CassandraRDD作为输入创建ConstantInputDStream。ConstantInputDStream将在每个流间隔上提供相同的RDD，并且通过在该RDD上执行一个操作，您将触发RDD血统的具体化，从而每次都在Cassandra上执行查询。

确保被查询的数据不会无限制地增长，以避免增加查询次数并导致不稳定的流处理。

类似这样的内容应该可以达到目的(使用您的代码作为起始点):

import org.apache.spark.streaming.dstream.ConstantInputDStream
val ssc = new StreamingContext(conf, Seconds(10))
val cassandraRDD = ssc.cassandraTable("mykeyspace", "users").select("fname", "lname").where("lname = ?", "yu")
val dstream = new ConstantInputDStream(ssc, cassandraRDD)
dstream.foreachRDD{ rdd => 
    // any action will trigger the underlying cassandra query, using collect to have a simple output
    println(rdd.collect.mkString("n")) 
}
ssc.start()
ssc.awaitTermination()

我有同样的问题，并通过创建InputDStream类的子类找到了解决方案。有必要定义start()和compute()方法。

start()可用于制备。主要逻辑在compute()中。它将返回Option[RDD[T]]。为了使类更灵活，定义了InputStreamQuery trait。

trait InputStreamQuery[T] {
  // where clause condition for partition key
  def partitionCond : (String, Any)
  // function to return next partition key
  def nextValue(v:Any) : Option[Any]
  // where clause condition for clustering key
  def whereCond : (String, (T) => Any)
  // batch size
  def batchSize : Int
}

对于Cassandra表keyspace.test，创建test_by_date，通过分区键date重新组织表。

CREATE TABLE IF NOT exists keyspace.test
(id timeuuid, date text, value text, primary key (id))
CREATE MATERIALIZED VIEW IF NOT exists keyspace.test_by_date AS
SELECT *
FROM  keyspace.test
WHERE id IS NOT NULL 
PRIMARY KEY (date, id)
WITH CLUSTERING ORDER BY ( id ASC );

test表的一个可能实现应该是

class class Test(id:UUID, date:String, value:String)
trait InputStreamQueryTest extends InputStreamQuery[Test] {
  val dateFormat = "uuuu-MM-dd"
  // set batch size as 10 records
  override def batchSize: Int = 10
  // partitioning key conditions, query string and initial value
  override def partitionCond: (String, Any) = ("date = ?", "2017-10-01")
  // clustering key condition, query string and function to get clustering key from the instance
  override def whereCond: (String, Test => Any) = (" id > ?", m => m.id)
  // return next value of clustering key. ex) '2017-10-02' for input value '2017-10-01'
  override def nextValue(v: Any): Option[Any] = {
    import java.time.format.DateTimeFormatter
    val formatter = DateTimeFormatter.ofPattern( dateFormat)
    val nextDate = LocalDate.parse(v.asInstanceOf[String], formatter).plusDays(1)
    if ( nextDate.isAfter( LocalDate.now()) ) None
    else Some( nextDate.format(formatter))
  }
}

它可以在CassandraInputStream类中使用如下。

class CassandraInputStream[T: ClassTag]
(_ssc: StreamingContext, keyspace:String, table:String)
(implicit rrf: RowReaderFactory[T], ev: ValidRDDType[T]) 
extends InputDStream[T](_ssc) with InputStreamQuery[T] {
var lastElm:Option[T] = None
var partitionKey : Any = _
override def start(): Unit = {
  // find a partition key which stores some records
  def findStartValue(cql : String, value:Any): Any = {
    val rdd  = _ssc.sparkContext.cassandraTable[T](keyspace, table).where(cql, value).limit(1)
    if (rdd.cassandraCount() > 0 ) value
    else {
      nextValue(value).map( findStartValue( cql, _)).getOrElse( value)
    }
  }
  // get query string and initial value from partitionCond method
  val (cql, value) = partitionCond
  partitionKey = findStartValue(cql, value)
}
override def stop(): Unit = {}
override def compute(validTime: Time): Option[RDD[T]] = {
  val (cql, _) = partitionCond
  val (wh, whKey) = whereCond
  def fetchNext( patKey: Any) : Option[CassandraTableScanRDD[T]] = {
    // query with partitioning condition
    val query = _ssc.sparkContext.cassandraTable[T](keyspace, table).where( cql, patKey)
    val rdd = lastElm.map{ x =>
      query.where( wh, whKey(x)).withAscOrder.limit(batchSize)
    }.getOrElse( query.withAscOrder.limit(batchSize))
    if ( rdd.cassandraCount() > 0 ) {
      // store the last element of this RDD
      lastElm = Some(rdd.collect.last)
      Some(rdd)
    }
    else {
      // find the next partition key which stores data
      nextValue(patKey).flatMap{ k =>
        partitionKey = k
        fetchNext(k)}
    }
  }
  fetchNext( partitionKey)
}
}

组合所有类，

val conf = new SparkConf().setAppName(appName).setMaster(master)
val ssc = new StreamingContext(conf, Seconds(10))
val dstream = new CassandraInputStream[Test](ssc, "keyspace", "test_by_date") with InputStreamQueryTest
dstream.map(println).saveToCassandra( ... )
ssc.start()
ssc.awaitTermination()

相关内容

最新更新

热门标签：