feat: reduce duplication in EncodedFeatureBuilder
This commit is contained in:
parent
743241984a
commit
eaeb4bc1d8
|
@ -5,6 +5,7 @@ import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
|
@ -140,7 +141,7 @@ public class EncodedFeatureBuilder {
|
||||||
// Extract some extra information from the message text.
|
// Extract some extra information from the message text.
|
||||||
// Index stock symbols with $ prepended
|
// Index stock symbols with $ prepended
|
||||||
textFeatures.getStocks().stream()
|
textFeatures.getStocks().stream()
|
||||||
.filter(stock -> stock != null)
|
.filter(Objects::nonNull)
|
||||||
.forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase()));
|
.forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase()));
|
||||||
|
|
||||||
// Question marks
|
// Question marks
|
||||||
|
@ -173,26 +174,23 @@ public class EncodedFeatureBuilder {
|
||||||
}
|
}
|
||||||
|
|
||||||
// User name features
|
// User name features
|
||||||
if (message.getFromUserDisplayName().isPresent()) {
|
message.getFromUserDisplayName().ifPresent(id -> {
|
||||||
Locale locale = LanguageIdentifierHelper
|
Locale locale = LanguageIdentifierHelper.identifyLanguage(id);
|
||||||
.identifyLanguage(message.getFromUserDisplayName().get());
|
String normalizedDisplayName = NormalizerHelper.normalize(id, locale, penguinVersion);
|
||||||
String normalizedDisplayName = NormalizerHelper.normalize(
|
|
||||||
message.getFromUserDisplayName().get(), locale, penguinVersion);
|
|
||||||
TokenizerResult result = TokenizerHelper
|
TokenizerResult result = TokenizerHelper
|
||||||
.tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
|
.tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
|
||||||
tokenSeqStream.reset(result.tokenSequence);
|
tokenSeqStream.reset(result.tokenSequence);
|
||||||
try {
|
try {
|
||||||
versionedTweetFeatures.setUserDisplayNameTokenStream(
|
versionedTweetFeatures.setUserDisplayNameTokenStream(
|
||||||
streamSerializer.serialize(tokenSeqStream));
|
streamSerializer.serialize(tokenSeqStream));
|
||||||
versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString());
|
versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
LOG.error("TwitterTokenStream serialization error! Could not serialize: "
|
LOG.error("TwitterTokenStream serialization error! Could not serialize: " + id);
|
||||||
+ message.getFromUserDisplayName().get());
|
|
||||||
SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
|
SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
|
||||||
versionedTweetFeatures.unsetUserDisplayNameTokenStream();
|
versionedTweetFeatures.unsetUserDisplayNameTokenStream();
|
||||||
versionedTweetFeatures.unsetUserDisplayNameTokenStreamText();
|
versionedTweetFeatures.unsetUserDisplayNameTokenStreamText();
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
|
String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
|
||||||
versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);
|
versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);
|
||||||
|
|
Loading…
Reference in New Issue