Delete science/search/ingester/config directory
This commit is contained in:
parent
921e154bef
commit
d52379152a
|
@ -1,2 +0,0 @@
|
|||
## Ingester Configs
|
||||
This directory contains pipeline configurations for the tweet ingesters (realtime, protected and realtime_cg) and the user-updates ingester. The pipeline configurations define an ordered sequence of stages that the tweet or user update goes through before reaching Earlybird. Source code for the various stages referenced in the configs can be found at src/java/com/twitter/search/ingester/pipeline/twitter.
|
|
@ -1,30 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
|
||||
<!--
|
||||
This indexer reads UserModification from user_modification Kafka topic, converts the
|
||||
data into AntisocialUserUpdate by querying Gizmoduck and then writes the data to the
|
||||
the search_user_updates Kafka topic.
|
||||
-->
|
||||
<pipeline>
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="pipeline">
|
||||
|
||||
<!-- This queue is a factor of batchSize larger than inner queues because it is unbatched -->
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="500"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.userupdates.UserUpdatesPipelineStage"
|
||||
environment="prod"
|
||||
driverFactoryId="pipeline"/>
|
||||
</pipeline>
|
|
@ -1,202 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
|
||||
<!-- Ingesters process tweet create events from TweetyPie and write them to a queue for Earlybird
|
||||
to index. -->
|
||||
<pipeline>
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<!-- Read tweets from the thrift kafka queue. The reader loops forever. -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.KafkaRawRecordConsumerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
kafkaConsumerGroupId=""
|
||||
maxPollRecords="1"
|
||||
pollTimeoutMs="1000"
|
||||
partitioned="false"
|
||||
deciderKey=""
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Deserialize the bytes into TweetData -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TweetEventDeserializerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Filter to only have the safetytype for this cluster -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterEventsBySafetyTypeStage"
|
||||
tweetCreateLatencyLogThresholdMillis="5000"
|
||||
safetyType="PROTECTED"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse to TwitterMessage -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ThriftTweetParserStage"
|
||||
tweetDeleteEventBranchNames="kafka_update_events_delete"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<branch>
|
||||
<pipeline key="kafka_update_events_delete">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_update_events_delete">
|
||||
|
||||
<!-- we are willing to queue more deletes than other stages,
|
||||
to make sure we don't slow down the incoming tweets -->
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.DeleteUpdateEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka_update_events_delete"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
|
||||
<!-- filters out messages that are not formatted correctly -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterTwitterMessageStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space ids from space urls if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceIdsStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
|
||||
<!-- looks up user reputation scores for each message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.LookupUserPropertiesBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextFeatureExtractionWorkersStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- compute text quality score of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextQualityEvaluationWorkerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Extract lat/lon pairs from the text, and geocode them -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.SingleTweetExtractAndGeocodeLatLonStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- adds coded locations -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.PopulateCodedLocationsBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertMessageToThriftStage"
|
||||
thriftVersionedEventsBranchName="kafka_base_tweets"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Branch for tweets -->
|
||||
<branch>
|
||||
<pipeline key="kafka_base_tweets">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_base_tweets">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId="search_ingester_indexing_events"
|
||||
kafkaTopicName="search_ingester_indexing_events_protected_prod"
|
||||
driverFactoryId="kafka_base_tweets"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- Resolve compressed URL via Pink -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ResolveCompressedUrlsBatchedStage"
|
||||
pinkClientId="INGESTER"
|
||||
batchedStageBatchSize="10"
|
||||
tweetMaxAgeToResolve="10000"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve card information -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveCardBatchedStage"
|
||||
tweetypieClientId="ingester.prod"
|
||||
filterProtected="false"
|
||||
internalBatchSize="50"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve named entities -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveNamedEntitiesSingleTweetStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space admins and title for a tweet if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceAdminsAndTitleStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextUrlsFeatureExtractionStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Compute the tweet signature -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ComputeTweetSignatureStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertDelayedMessageToThriftStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
stageName="UpdateEvents"
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka"/>
|
||||
</pipeline>
|
|
@ -1,240 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
|
||||
<!-- Ingesters process tweet create events from TweetyPie and write them to a queue for Earlybird
|
||||
to index. -->
|
||||
<pipeline>
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<!-- Read tweets from the thrift kafka queue. The reader loops forever. -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.KafkaRawRecordConsumerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
kafkaConsumerGroupId=""
|
||||
maxPollRecords="1"
|
||||
pollTimeoutMs="1000"
|
||||
partitioned="false"
|
||||
deciderKey=""
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Deserialize the bytes into TweetData -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TweetEventDeserializerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Filter to only have the safetytype for this cluster -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterEventsBySafetyTypeStage"
|
||||
tweetCreateLatencyLogThresholdMillis="5000"
|
||||
safetyType="PUBLIC"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse to TwitterMessage -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ThriftTweetParserStage"
|
||||
tweetCreateEventBranchNames="kafka_retweet_and_reply"
|
||||
tweetDeleteEventBranchNames="kafka_update_events_delete"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<branch>
|
||||
<pipeline key="kafka_update_events_delete">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_update_events_delete">
|
||||
|
||||
<!-- we are willing to queue more deletes than other stages,
|
||||
to make sure we don't slow down the incoming tweets -->
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.DeleteUpdateEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka_update_events_delete"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- Processes retweets and replies to tweets -->
|
||||
<branch>
|
||||
<pipeline key="kafka_retweet_and_reply">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_retweet_and_reply">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<!-- An incoming reply to this stage can either be a tweet directed at someone using @mention, or
|
||||
a tweet that is a direct reply to another tweet. This stage filters retweets and tweets that are
|
||||
direct replies to other tweets into the retweet_and_reply pipeline -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterRetweetsAndRepliesStage"
|
||||
driverFactoryId="kafka_retweet_and_reply"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertToThriftVersionedEventsStage"
|
||||
driverFactoryId="kafka_retweet_and_reply"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.RetweetAndReplyUpdateEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka_retweet_and_reply"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- filters out messages that are not formatted correctly -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterTwitterMessageStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space ids from space urls if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceIdsStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
|
||||
<!-- looks up user reputation scores for each message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.LookupUserPropertiesBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextFeatureExtractionWorkersStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- compute text quality score of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextQualityEvaluationWorkerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Extract lat/lon pairs from the text, and geocode them -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.SingleTweetExtractAndGeocodeLatLonStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- adds coded locations -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.PopulateCodedLocationsBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertMessageToThriftStage"
|
||||
thriftVersionedEventsBranchName="kafka_base_tweets"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Branch for tweets -->
|
||||
<branch>
|
||||
<pipeline key="kafka_base_tweets">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_base_tweets">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName="search_ingester_indexing_events_realtime_prod"
|
||||
driverFactoryId="kafka_base_tweets"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- Resolve compressed URL via Pink -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ResolveCompressedUrlsBatchedStage"
|
||||
pinkClientId="INGESTER"
|
||||
batchedStageBatchSize="10"
|
||||
tweetMaxAgeToResolve="10000"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve card information -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveCardBatchedStage"
|
||||
tweetypieClientId="ingester.prod"
|
||||
internalBatchSize="50"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve named entities -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveNamedEntitiesSingleTweetStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space admins and title for a tweet if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceAdminsAndTitleStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextUrlsFeatureExtractionStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Compute the tweet signature -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ComputeTweetSignatureStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertDelayedMessageToThriftStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
stageName="UpdateEvents"
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka"/>
|
||||
</pipeline>
|
|
@ -1,199 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<!-- Ingesters process tweet create events from TweetyPie and write them to a queue for Earlybird
|
||||
to index. -->
|
||||
<pipeline>
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<!-- Read tweets from the thrift kafka queue. The reader loops forever. -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.KafkaRawRecordConsumerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
kafkaConsumerGroupId=""
|
||||
maxPollRecords="1"
|
||||
pollTimeoutMs="1000"
|
||||
partitioned="false"
|
||||
deciderKey=""
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Deserialize the bytes into TweetData -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TweetEventDeserializerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Filter to only have the safetytype = PUBLIC or PROTECTED -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterEventsBySafetyTypeStage"
|
||||
tweetCreateLatencyLogThresholdMillis="5000"
|
||||
safetyType="PUBLIC_OR_PROTECTED"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse to TwitterMessage -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ThriftTweetParserStage"
|
||||
tweetDeleteEventBranchNames="kafka_update_events_delete"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<branch>
|
||||
<pipeline key="kafka_update_events_delete">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_update_events_delete">
|
||||
|
||||
<!-- we are willing to queue more deletes than other stages,
|
||||
to make sure we don't slow down the incoming tweets -->
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.DeleteUpdateEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka_update_events_delete"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- filters out messages that are not formatted correctly -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.FilterTwitterMessageStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space ids from space urls if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceIdsStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
|
||||
<!-- looks up user reputation scores for each message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.LookupUserPropertiesBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextFeatureExtractionWorkersStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- compute text quality score of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextQualityEvaluationWorkerStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Extract lat/lon pairs from the text, and geocode them -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.SingleTweetExtractAndGeocodeLatLonStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- adds coded locations -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.PopulateCodedLocationsBatchedStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertMessageToThriftStage"
|
||||
thriftVersionedEventsBranchName="kafka_base_tweets"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Branch for tweets -->
|
||||
<branch>
|
||||
<pipeline key="kafka_base_tweets">
|
||||
<property
|
||||
propName="validator"
|
||||
className="org.apache.commons.pipeline.validation.SimplePipelineValidator"/>
|
||||
<listener
|
||||
className="org.apache.commons.pipeline.listener.ObjectProcessedEventCounter"/>
|
||||
<driverFactory
|
||||
className="org.apache.commons.pipeline.driver.DedicatedThreadStageDriverFactory"
|
||||
id="kafka_base_tweets">
|
||||
|
||||
<property
|
||||
propName="queueFactory"
|
||||
className="org.apache.commons.pipeline.util.BlockingQueueFactory$ArrayBlockingQueueFactory"
|
||||
capacity="1000"
|
||||
fair="false"/>
|
||||
</driverFactory>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka_base_tweets"/>
|
||||
</pipeline>
|
||||
</branch>
|
||||
|
||||
<!-- Resolve compressed URL via Pink -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ResolveCompressedUrlsBatchedStage"
|
||||
pinkClientId="INGESTER"
|
||||
batchedStageBatchSize="10"
|
||||
tweetMaxAgeToResolve="10000"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve card information -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveCardBatchedStage"
|
||||
tweetypieClientId=""
|
||||
internalBatchSize="50"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Retrieve named entities -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveNamedEntitiesSingleTweetStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- retrieves space admins and title for a tweet if the tweet has space urls -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.RetrieveSpaceAdminsAndTitleStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- extract text features of the message -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.TextUrlsFeatureExtractionStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Compute the tweet signature -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ComputeTweetSignatureStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<!-- Parse the TwitterMessages into ThriftStatuses -->
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.ConvertDelayedMessageToThriftStage"
|
||||
driverFactoryId="kafka"/>
|
||||
|
||||
<stage
|
||||
className="com.twitter.search.ingester.pipeline.twitter.kafka.TweetThriftVersionedEventsKafkaProducerStage"
|
||||
kafkaClusterPath=""
|
||||
stageName="UpdateEvents"
|
||||
kafkaClientId=""
|
||||
kafkaTopicName=""
|
||||
driverFactoryId="kafka"/>
|
||||
</pipeline>
|
Loading…
Reference in New Issue