# encoding: UTF-8

module Quesadilla
  # Extract entities from text
  class Extractor
    require 'quesadilla/core_ext/string'
    Dir[File.expand_path('../extractor/*.rb', __FILE__)].each { |f| require f }

    include Autolinks
    include Emoji
    include Hashtags
    include HTML
    include Markdown

    # @return [Hash] default extractor options
    def self.default_options
      {
        markdown: true,
        markdown_code: true,
        markdown_links: true,
        markdown_triple_emphasis: true,
        markdown_double_emphasis: true,
        markdown_emphasis: true,
        markdown_strikethrough: true,
        hashtags: true,
        autolinks: true,
        emoji: true,
        html: true,
        html_renderer: Quesadilla::HTMLRenderer
      }
    end

    # @param options [Hash] an optional options hash. Defaults to `Quesadilla::Extractor.default_options`.
    # @option options markdown [Boolean] Should extract Markdown. Defaults to `true`.
    # @option options markdown_code [Boolean] Should extract Markdown code. Defaults to `true`.
    # @option options markdown_links [Boolean] Should extract Markdown links. Defaults to `true`.
    # @option options markdown_triple_emphasis [Boolean] Should extract Markdown triple emphasis (bold italic). Defaults to `true`.
    # @option options markdown_double_emphasis [Boolean] Should extract Markdown double emphasis (bold). Defaults to `true`.
    # @option options markdown_emphasis [Boolean] Should extract Markdown emphasis (italic). Defaults to `true`.
    # @option options markdown_strikethrough [Boolean] Should extract Markdown strikethrough. Defaults to `true`.
    # @option options hashtags [Boolean] Should extract hashtags. Defaults to `true`.
    # @option options autolinks [Boolean] Should automatically detect links. Defaults to `true`.
    # @option options emoji [Boolean] Should extract named emoji. Defaults to `true`.
    # @option options html [Boolean] Should generate HTML. Defaults to `true`.
    # @option options html_renderer [Class] class to use as HTML renderer. Defaults to `Quesadilla::HTMLRenderer`.
    def initialize(options = {})
      @options = self.class.default_options.merge(options)
      # The renderer is only instantiated when HTML output is requested.
      @renderer = @options[:html_renderer].new if @options[:html]
    end

    # Extract entities from text
    # @param original_text the text to extract from
    # @return [Hash] hash containing the display text (`:display_text`), the entities (`:entities`),
    #   and, when the `html` option is enabled, the html text (`:display_html`)
    def extract(original_text)
      # Work on a copy so the caller's string is never mutated.
      @original_text = original_text.dup

      # Emoji colon-syntax
      # (runs before the working copy is taken, so later steps see the replaced text)
      replace_emoji if @options[:emoji]

      @working_text = @original_text.dup
      @entities = []

      # Get entities
      extract_markdown if @options[:markdown]
      extract_hashtags if @options[:hashtags]
      extract_autolinks if @options[:autolinks]

      # Sort entities by where they start in the original text
      @entities.sort! do |a, b|
        a[:indices].first <=> b[:indices].first
      end

      # Adjust display for each entity
      display_text = sub_entities(@original_text, @entities)

      # Return
      hash = {
        display_text: display_text,
        entities: @entities
      }
      hash[:display_html] = display_html(display_text, @entities) if @options[:html]
      hash
    end

    private

    # Invisible character from the reserved range replaces markdown we've already parsed.
    REPLACE_TOKEN = "\uf042".freeze

    # Build a short, human-friendly version of a URL.
    #
    # Only a *leading* scheme (`http://` / `https://`) and a leading `www.` are removed;
    # occurrences later in the URL (for example inside a query string) are left intact so
    # the displayed text does not misrepresent the link target. The result is truncated via
    # `q_truncate` (not defined in this file) and a single trailing slash is dropped.
    #
    # @param url [String] the URL to shorten
    # @return [String] the display form of the URL
    def display_url(url)
      url = url.sub(%r{\A(?:https?://)?(?:www\.)?}i, '').q_truncate(32, omission: '…')
      url.chomp('/')
    end

    # Ensure a URL carries a scheme.
    #
    # @param url [String] the URL as found in the text
    # @return [String] `url` unchanged when it already contains `://`, otherwise prefixed with `http://`
    def quality_url(url)
      return url if url.include?('://')
      "http://#{url}"
    end

    # Replace each entity's source text with its replacement text and track the index shift.
    #
    # Entities must be ordered by start index, because a running offset is applied to every
    # subsequent entity. When `display` is false, each entity also receives `:display_indices`,
    # i.e. its position in the returned text.
    #
    # @param input_text [String] the text to substitute into (never mutated)
    # @param entities [Array<Hash>] entities with `:text`, `:display_text` and `:indices`
    #   (plus `:display_indices` when `display` is true)
    # @param display [Boolean] when true, `input_text` is treated as display text: `:display_text`
    #   and `:display_indices` are used as the source text and position
    # @yield [entity] optional; the block's return value is used as the replacement text
    #   instead of `entity[:display_text]`
    # @return [String] the text with all substitutions applied
    def sub_entities(input_text, entities, display = false, &block)
      # Adjust output text for each entity
      output_text = input_text
      offset = 0

      entities.each do |entity|
        entity_original_text = display ? entity[:display_text] : entity[:text]
        entity_display_text = block_given? ? yield(entity) : entity[:display_text]
        indices = display ? entity[:display_indices] : entity[:indices]

        # Use the entity's display text instead of original text if they're different
        unless entity_original_text == entity_display_text
          # Positions of the entity in the already-rewritten output text
          entity_start = indices[0] - offset
          entity_end = indices[1] - offset

          # Get the fragment before the entity. Slicing by length keeps the character at
          # index 0 when the entity starts at position 1.
          before_frag = entity_start > 0 ? output_text[0, entity_start] : ''

          # Get the fragment after the entity
          after_frag = entity_end < output_text.length ? output_text[entity_end..-1] : ''

          # Update the output text
          output_text = before_frag + entity_display_text + after_frag
        end

        # Update offset
        adjust = entity_original_text.length - entity_display_text.length
        unless display
          entity[:display_indices] = [
            entity[:indices][0] - offset,
            entity[:indices][1] - offset - adjust
          ]
        end
        offset += adjust
      end

      output_text
    end
  end
end