lib/twitterscraper/tweet.rb in twitterscraper-ruby-0.16.0 vs lib/twitterscraper/tweet.rb in twitterscraper-ruby-0.17.0
- old
+ new
@@ -4,10 +4,11 @@
class Tweet
KEYS = [
:screen_name,
:name,
:user_id,
+ :profile_image_url,
:tweet_id,
:text,
:links,
:hashtags,
:image_urls,
@@ -49,10 +50,15 @@
tweet['created_at'] = Time.parse(tweet['created_at'])
new(tweet)
end
end
+ # .js-stream-item
+ # .js-stream-tweet{data: {screen-name:, tweet-id:}}
+ # .stream-item-header
+ # .js-tweet-text-container
+ # .stream-item-footer
# Parses a raw HTML string with Nokogiri, selects every
# `div.js-stream-tweet` that is a direct child of an `li.js-stream-item`,
# and hands the matched node set to `from_tweets_html`.
#
# @param text [String] HTML markup to parse
# @return whatever `from_tweets_html` returns for the matched nodes
def from_html(text)
  document = Nokogiri::HTML(text)
  # Class matching is substring-based (`contains`), so elements carrying
  # additional classes alongside the target one still match.
  tweet_nodes = document.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]")
  from_tweets_html(tweet_nodes)
end
@@ -70,10 +76,12 @@
Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
return nil
end
inner_html = Nokogiri::HTML(html.inner_html)
+
+ profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }[0]
has_media = !image_urls.empty? || (video_url && !video_url.empty?)
@@ -97,9 +105,10 @@
timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
new(
screen_name: screen_name,
name: html.attr('data-name'),
user_id: html.attr('data-user-id').to_i,
+ profile_image_url: profile_image_url,
tweet_id: tweet_id,
text: text,
links: links,
hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
image_urls: image_urls,