Sha256: 32b2d0db9f638241433ce39cd622300611186ad44176c36da4f5a8872c94ad35

Contents?: true

Size: 1.69 KB

Versions: 27

Compression:

Stored size: 1.69 KB

Contents

#include "ruby.h"
#include "linguist.h"
#include "lex.linguist_yy.h"

// Upper bound, in bytes, on the length of a token that is kept by the
// tokenizer; anything longer is unlikely to be useful.
#define MAX_TOKEN_LEN 32

/*
 * yywrap callback for the linguist_yy scanner.
 *
 * yyscanner: scanner handle; not consulted.
 *
 * Always returns 1.
 * NOTE(review): presumably this tells the generated lexer that no further
 * input follows the current buffer -- confirm against the flex manual.
 */
int linguist_yywrap(yyscan_t yyscanner) {
	(void) yyscanner;	/* intentionally unused */
	return 1;
}

/*
 * Implementation of Linguist::Tokenizer#extract_tokens.
 *
 * Runs the linguist_yy scanner over the leading bytes of rb_data (at most
 * MAX_INPUT_LEN of them) and collects the tokens it reports.
 *
 * self:    receiver; not used.
 * rb_data: Ruby String to tokenize (enforced with Check_Type).
 *
 * Returns a new Ruby Array of Strings. Regular tokens are stored as-is,
 * shebang tokens are prefixed with "SHEBANG#!", and SGML tokens get a
 * trailing ">". Tokens longer than MAX_TOKEN_LEN bytes are dropped.
 * Every token reported by the scanner is released with free() here.
 *
 * Raises NoMemoryError (via rb_memerror) if the scanner cannot be
 * initialised.
 */
static VALUE rb_tokenizer_extract_tokens(VALUE self, VALUE rb_data) {
	// Only this many leading bytes of the input are scanned.
	enum { MAX_INPUT_LEN = 100000 };

	YY_BUFFER_STATE buf;
	yyscan_t scanner;
	struct tokenizer_extra extra;
	VALUE ary, s;
	long input_len, token_len;
	int r;

	Check_Type(rb_data, T_STRING);

	input_len = RSTRING_LEN(rb_data);
	if (input_len > MAX_INPUT_LEN) {
		input_len = MAX_INPUT_LEN;
	}

	// A nonzero result means `scanner` was never set up; passing it on to
	// the other linguist_yy* calls would be undefined behaviour.
	if (linguist_yylex_init_extra(&extra, &scanner) != 0) {
		rb_memerror();
	}
	// The clamp above guarantees input_len fits in an int.
	buf = linguist_yy_scan_bytes(RSTRING_PTR(rb_data), (int) input_len, scanner);

	ary = rb_ary_new();
	do {
		// Reset the hand-off slot so a call that reports nothing is
		// seen as NO_ACTION with no token to free.
		extra.type = NO_ACTION;
		extra.token = NULL;
		r = linguist_yylex(scanner);
		switch (extra.type) {
		case NO_ACTION:
			break;
		case REGULAR_TOKEN:
			token_len = (long) strlen(extra.token);
			if (token_len <= MAX_TOKEN_LEN) {
				rb_ary_push(ary, rb_str_new(extra.token, token_len));
			}
			free(extra.token);
			break;
		case SHEBANG_TOKEN:
			token_len = (long) strlen(extra.token);
			if (token_len <= MAX_TOKEN_LEN) {
				s = rb_str_new2("SHEBANG#!");
				rb_str_cat(s, extra.token, token_len);
				rb_ary_push(ary, s);
			}
			free(extra.token);
			break;
		case SGML_TOKEN:
			token_len = (long) strlen(extra.token);
			if (token_len <= MAX_TOKEN_LEN) {
				s = rb_str_new(extra.token, token_len);
				rb_str_cat2(s, ">");
				rb_ary_push(ary, s);
			}
			free(extra.token);
			break;
		default:
			// Unknown action type: nothing to collect.
			break;
		}
	} while (r);	// the loop stops once linguist_yylex returns 0

	linguist_yy_delete_buffer(buf, scanner);
	linguist_yylex_destroy(scanner);

	return ary;
}

/*
 * Extension initialiser, exported with default symbol visibility.
 *
 * Defines the module Linguist, the class Linguist::Tokenizer (a subclass
 * of Object), and registers rb_tokenizer_extract_tokens as its
 * one-argument instance method "extract_tokens".
 */
__attribute__((visibility("default"))) void Init_linguist(void) {
	VALUE rb_mLinguist = rb_define_module("Linguist");
	VALUE rb_cTokenizer = rb_define_class_under(rb_mLinguist, "Tokenizer", rb_cObject);

	rb_define_method(rb_cTokenizer, "extract_tokens", rb_tokenizer_extract_tokens, 1);
}

Version data entries

27 entries across 27 versions & 1 rubygem

Version Path
github-linguist-7.11.1 ext/linguist/linguist.c
github-linguist-7.10.0 ext/linguist/linguist.c
github-linguist-7.11.0 ext/linguist/linguist.c
github-linguist-7.9.0 ext/linguist/linguist.c
github-linguist-7.8.0 ext/linguist/linguist.c
github-linguist-7.7.0 ext/linguist/linguist.c
github-linguist-7.6.1 ext/linguist/linguist.c
github-linguist-7.6.0 ext/linguist/linguist.c
github-linguist-7.5.1 ext/linguist/linguist.c
github-linguist-7.5.0 ext/linguist/linguist.c
github-linguist-7.4.0 ext/linguist/linguist.c
github-linguist-7.3.1 ext/linguist/linguist.c
github-linguist-7.3.0 ext/linguist/linguist.c
github-linguist-7.2.0 ext/linguist/linguist.c
github-linguist-7.1.3 ext/linguist/linguist.c
github-linguist-7.1.2 ext/linguist/linguist.c
github-linguist-7.1.1 ext/linguist/linguist.c
github-linguist-7.1.0 ext/linguist/linguist.c
github-linguist-7.0.0 ext/linguist/linguist.c
github-linguist-6.4.1 ext/linguist/linguist.c