Sha256: a88c188d5e0cc7e68de6bbd13f5a8b46abfa8563fb1e4c4b5f0c0561173872de

Contents?: true

Size: 1 KB

Versions: 7

Compression:

Stored size: 1 KB

Contents

#include "text_pipeline.h"
#include <cstddef>
#include <iostream>

// Turns a piece of text into a sparse example.
//
// The member token list is emptied, the text is handed to the tokeniser, and
// the token list is then passed (by address) to the generator together with
// data_set and create_features. The generator's result is returned unchanged.
//
// NOTE(review): the tokeniser presumably reports each token back through
// process_token(), which is what refills `tokens` -- confirm against the
// tokeniser implementation.
DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features) {
  // Start from an empty token list so tokens from an earlier call are dropped.
  tokens.clear();

  tokeniser->tokenise(text);

  DataSet::SparseExample *example = generator->generate(data_set, &tokens, create_features);
  return example;
}

// Handles a single token delimited by the `start` and `end` pointers.
//
// Every registered processor is invoked on the token first, in registration
// order. Each selector is then consulted in order; the first one whose
// select() returns false causes the token to be discarded. A token accepted
// by all selectors has its `start` pointer appended to `tokens`.
//
// NOTE(review): only `start` is stored, so the token is presumably
// NUL-terminated by the tokeniser or a processor -- confirm.
void Preprocessing::Text::TextPipeline::process_token(char *start, char *end) {
  // Unsigned indices: avoids comparing a signed int against the unsigned
  // value returned by size().
  for(std::size_t i = 0; i < processors.size(); i++)
    processors[i]->process(start, end);

  for(std::size_t i = 0; i < selectors.size(); i++)
    if(!selectors[i]->select(start, end))
      return;  // rejected by a selector: drop the token

  tokens.push_back(start);
}

// Builds a heap-allocated TextPipeline wired with the default stages:
// a SimpleTokeniser bound to the pipeline, the Downcase and PorterStemmer
// processors (in that order), a StopWords selector, and a TokenCounter
// generator constructed with TokenCounter::Local.
//
// The pipeline and all of its stages are created with `new` and nothing here
// releases them.
// NOTE(review): ownership presumably passes to the caller / the pipeline's
// destructor -- confirm.
Preprocessing::Text::TextPipeline *Preprocessing::Text::StandardPipeline() {
  TextPipeline *standard = new TextPipeline();

  // Tokenisation: the tokeniser receives a pointer back to the pipeline.
  standard->tokeniser = new SimpleTokeniser(standard);

  // Token processors, applied in the order they are added.
  standard->processors.push_back(new Downcase());
  standard->processors.push_back(new PorterStemmer());

  // Token selectors.
  standard->selectors.push_back(new StopWords());

  // Example generation.
  standard->generator = new TokenCounter(TokenCounter::Local);

  return standard;
}

Version data entries

7 entries across 7 versions & 1 rubygem

Version Path
thera-0.0.8 lib/quarry/src/preprocessing/text/text_pipeline.cpp
thera-0.0.7 lib/quarry/src/preprocessing/text/text_pipeline.cpp
thera-0.0.6 lib/quarry/src/preprocessing/text/text_pipeline.cpp
thera-0.0.5 lib/quarry/src/preprocessing/text/text_pipeline.cpp
thera-0.0.4 lib/quarry/src/preprocessing/text/text_pipeline.cpp
thera-0.0.3 lib/quarry/src/preprocessing/text/text_pipeline.cpp
thera-0.0.2 lib/quarry/src/preprocessing/text/text_pipeline.cpp