Nokogiri scraping with Ruby On Rails not working as expected - ruby-on-rails

I'm completely new to Ruby on Rails, so I think I might be missing something obvious. I'm currently working on a web app that scrapes auction websites. The bones of the app were created by someone else. I'm now trying to add scrapers for new websites, but they don't seem to be working.
I have read through some of the Nokogiri documentation, confirmed via the Rails console that the scraped items are not being written to the database (the seeded source URLs being targeted are there), and used the Chrome extension CSS Selector Tester to check that I am targeting the correct CSS selectors. The record ids are also correct when I check them via the Rails console.
I have put what I think are the important sections of code below, but I might be missing something that I don't realise is important.
The websites I'm having issues with are Lot-art.com & Lot-Tissimo.com
Any help will be much appreciated.
Seeded URLs
Source.create(name: "Auction.fr", query_template: "https://www.auction.fr/_en/lot/search/?contexte=futures&tri=date_debut%20ASC&query={query}&page={page}")
Source.create(name: "Invaluable.co.uk", query_template: "https://www.invaluable.co.uk/search/api/search-results?keyword={query}&size=1000")
Source.create(name: "Interencheres.com", query_template: "http://www.interencheres.com/en/recherche/lot?search%5Bkeyword%5D={query}&page={page}")
Source.create(name: "Gazette-drouot.com", query_template: "http://catalogue.gazette-drouot.com/html/g/recherche.jsp?numPage={page}&filterDate=1&query={query}&npp=100")
Source.create(name: "Lot-art.com", query_template: "http://www.lot-art.com/auction-search/?form_id=lot_search_form&page=1&mq=&q={query}&ord=recent")
Source.create(name: "Lot-tissimo.com", query_template: "https://lot-tissimo.com/en/cmd=s&lwr=&ww={query}&xw=&srt=SN&wg=EUR&page={page}")
Scheduler code
require 'rufus-scheduler'
require 'nokogiri'
require 'mechanize'
require 'open-uri'
require "net/https"
s = Rufus::Scheduler.singleton
s.interval '1m' do
setting = Setting.find(1)
agent = Mechanize.new
agent.user_agent_alias = 'Windows Chrome'
agent.cookie_jar.load(File.join(Rails.root, 'tmp/cookies.yaml'))
List.all.each do |list|
number_of_new_items = 0
list.actions.each do |action|
url = action.source.query_template.gsub('{query}', action.list.query)
case action.source.id
when 1 # Auction.fr
20.downto(1) do |page|
doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
doc.css("div.list-products > ul > li").reverse.each do |item_data|
price = 0
if item_data.at_css("h3.h4.adjucation.ft-blue") && /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)
price = /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)[1].gsub(" ", "")
end
item = action.items.new(
title: item_data.at_css("h2").text.strip,
url: item_data.at_css("h2 a")["href"],
picture: item_data.at_css("div.image-wrap.lazy div.image img")["src"],
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disable writing logs
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
when 97 # Lot-Tissimo.com
  # Walk the result pages from last to first, saving one item per listing row.
  5.downto(1) do |page|
    doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
    doc.css("#inhalt > .objektliste").reverse.each do |item_data|
      # No price is parsed for this source yet, so store 0 like the
      # Interencheres, Gazette-drouot and Lot-art branches do. Without this
      # assignment `price` is an undefined name inside this block and
      # `price: price` raises NameError on the first item, so nothing from
      # Lot-Tissimo is ever saved.
      price = 0
      item = action.items.new(
        title: item_data.at_css("div.objli-desc").text.strip,
        url: item_data.at_css("td.objektliste-foto a")["href"],
        picture: item_data.at_css("td.objektliste-foto a#lot_link img")["src"],
        price: price,
        currency: "€"
      )
      ActiveRecord::Base.logger.silence do # This disable writing logs
        if item.save
          number_of_new_items = number_of_new_items + 1
        end
      end
    end
  end
when 2 # Invaluable.co.uk
doc = JSON.parse(open(url).read)
doc["itemViewList"].reverse.each do |item_data|
puts item_data["itemView"]["photos"]
item = action.items.new(
title: item_data["itemView"]["title"],
url: "https://www.invaluable.co.uk/buy-now/" + item_data["itemView"]["title"].parameterize + "-" + item_data["itemView"]["ref"],
picture: item_data["itemView"]["photos"] != nil ? item_data["itemView"]["photos"].first["_links"]["medium"]["href"] : nil,
price: item_data["itemView"]["price"],
currency: item_data["itemView"]["currencySymbol"]
)
ActiveRecord::Base.logger.silence do # This disable writing logs
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
when 3 # Interencheres.com
# doc = Nokogiri::HTML(open(url))
5.downto(1) do |page|
doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
doc.css("div#lots_0 div.ligne_vente").reverse.each do |item_data|
price = 0
item = action.items.new(
title: item_data.at_css("div.ph_vente div.des_vente p a").text.strip,
url: "http://www.interencheres.com" + item_data.at_css("div.ph_vente div.des_vente p a")["href"],
picture: item_data.at_css("div.ph_vente div.gd_ph_vente img")["src"],
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disable writing logs
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
when 4 # Gazette-drouot.com
5.downto(1) do |page|
# doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
doc = agent.get(url.gsub('{page}', page.to_s))
# doc = agent.get(url)
doc.css("div#recherche_resultats div.lot_recherche").reverse.each do |item_data|
price = 0
picture = item_data.at_css("img.image_thumb_recherche") ? item_data.at_css("img.image_thumb_recherche")["src"] : nil
item = action.items.new(
title: item_data.at_css("#des_recherche").text.strip.truncate(140),
url: "http://catalogue.gazette-drouot.com/html/g/" + item_data.at_css("a.lien_under")["href"],
picture: picture,
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disable writing logs
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
when 69 # Lot-art.com
  # Single results page (the seeded query template hard-codes page=1).
  doc = agent.get(url)
  doc.css("div.lot_list_holder").reverse.each do |item_data|
    price = 0
    # at_css already returns a single node (or nil). Indexing that node with
    # [0] is an attribute lookup, not "first match", and yields nil, so the
    # old `at_css(...)[0].text` raised NoMethodError for every lot.
    link = item_data.at_css("div.lot_list_body a")
    thumb = item_data.at_css("a.lot_list_thumb img")
    # Skip holders without a lot link instead of aborting the whole run.
    next unless link
    item = action.items.new(
      title: link.text.strip.truncate(140),
      # The href lives on the anchor; the surrounding div has no href attribute.
      # NOTE(review): confirm whether the site emits absolute or relative hrefs.
      url: link["href"],
      picture: thumb && thumb["src"],
      price: price,
      currency: "€"
    )
    ActiveRecord::Base.logger.silence do # This disable writing logs
      if item.save
        number_of_new_items = number_of_new_items + 1
      end
    end
  end
end
end
if number_of_new_items > 0 && setting.notifications_per_hour > setting.notifications_this_hour && setting.pushover_app_token.present? && setting.pushover_user_key.present?
url = URI.parse("https://api.pushover.net/1/messages.json")
req = Net::HTTP::Post.new(url.path)
req.set_form_data({
:token => setting.pushover_app_token,
:user => setting.pushover_user_key,
:message => "#{number_of_new_items} new items on #{list.name}!",
:url_title => "Check the list",
:url => "http://spottheauction.com/lists/#{list.id}"
})
res = Net::HTTP.new(url.host, url.port)
res.use_ssl = true
res.verify_mode = OpenSSL::SSL::VERIFY_PEER
res.start {|http| http.request(req) }
end
end
agent.cookie_jar.save(File.join(Rails.root, 'tmp/cookies.yaml'))
end
s.cron '0 * * * *' do
setting = Setting.find(1)
setting.notifications_this_hour = 0
setting.save
end

new just initializes an instance but doesn't save the instance. Do you actually call save somewhere?
You have two options:
Call save on the item:
item = action.items.new(
# ...
)
item.save
Or use create instead of new:
item = action.items.create(
# ...
)

In case someone else comes across this. I got the scraping of lot-art.com to work. It seemed that I was lacking specificity in the css selector for nokogiri to pull the correct data.
I am still having continuing issues with lot-tissimo although that appears to be from something else as other scrapers have issues such as scraping-hub's portia spiders.

Related

Login to amazon partnernet using Ruby Mechanize

I try to login into amazon partnernet, e.g. https://partnernet.amazon.de/ using the Ruby Mechanize gem:
Gemfile:
# https://github.com/sparklemotion/mechanize
gem 'mechanize'
The code below is a rake task. It worked in the past; I think Amazon changed the page HTML so that this code no longer works, e.g. by changing the submit button of the form name="sign_in" to an input of type="image".
desc "Cron Task for Email Notifications"
# Logs in to Amazon Partnernet, downloads the earnings report as XML and mails
# the parsed sales plus the summed commission via ReportsMailer.
task :email_amazon_stats => :environment do
  puts "Start: Fetch and send Amazon Sales from yesterday (#{Time.now})"
  # login to Amazon Partnernet
  a = Mechanize.new
  a.user_agent_alias = 'Mac Safari'
  a.follow_meta_refresh = true
  a.redirect_ok = true
  a.get('https://partnernet.amazon.de/') do |page|
    # Submit the login form
    page.form_with(:name => 'sign_in') do |f|
      username_field = f.field_with(:id => "username")
      username_field.value = "email@example.com"
      password_field = f.field_with(:id => "password")
      password_field.value = "somepassword"
    end.submit
    start_date = Time.now - 1.day
    end_date = Time.now
    # The month parameters are sent as month - 1.
    my_page2 = a.get("https://partnernet.amazon.de/gp/associates/network/reports/report.html?ie=UTF8&deviceType=all&endDay=#{(end_date.strftime('%d').to_i).to_s}&endMonth=#{((end_date.strftime('%m').to_i)-1).to_s}&endYear=#{end_date.strftime('%Y').to_i.to_s}&periodType=exact&preSelectedPeriod=monthToDate&program=all&reportType=earningsReport&startDay=#{start_date.strftime('%d').to_i.to_s}&startMonth=#{((start_date.strftime('%m').to_i)-1).to_s}&startYear=#{start_date.strftime('%Y').to_s}")
    form = my_page2.form_with(:name => 'htmlReport')
    button = form.button_with(:name => 'submit.download_XML')
    xml = a.submit(form, button)
    # Sample attributes of one <Item> element in the downloaded report:
    # ASIN="3423347570"
    # Binding="paperback"
    # Category="14"
    # Date="December 01, 2015"
    # DeviceType="BROWSER"
    # EDate="1448928000"
    # Earnings="0,65"
    # LinkType="asn"
    # Price="9,25"
    # Qty="1"
    # Rate="7,03"
    # Revenue="9,25"
    # Seller="Amazon.de"
    # Tag="yx-21"
    # Title="Kopf schlägt Kapital: Die ganz andere Art, ein Unternehmen zu gründen Von der Lust, ein Entrepreneur zu sein (dtv Sachbuch)"/>
    doc = Nokogiri::XML(xml.body)
    @sales = []
    doc.xpath("//Item").each do |item|
      @sales << {
        :sale_itemasin => item['ASIN'],
        :sale_itemname => item['Title'].truncate(80),
        :sale_date => Time.at(item['EDate'].to_i).strftime("%Y-%m-%d %H:%M:%S").to_s,
        # Amounts use a decimal comma; convert before formatting.
        :sale_amount => '%.2f' % item['Revenue'].gsub(',','.').to_f,
        :sale_commission => '%.2f' % item['Earnings'].gsub(',','.').to_f
      }
    end
    earnings = 0
    @sales.each do |s|
      earnings += s[:sale_commission].to_f
    end
    @total_commission = '%.2f' % earnings
  end
  ReportsMailer.daily_dashboard(@total_commission,@sales).deliver
  puts "Done: Fetch and send Amazon Sales from yesterday (#{Time.now})"
end
Can someone help me in this?
--
I looked for similar questions how to restructure the submit, but so far nothing works. Login is not happening. (Yes, PWD is correct :-) )
Similar question, but does not solve the problem above: Cannot Login to Amazon with Ruby Mechanize
So.. I debugged the code, this version is working now as expected:
desc "Cron Task for Email Notifications"
# Requests the report deep link (which redirects to the login form), signs in
# with credentials from the environment, requests the report again, downloads
# it as XML and mails the parsed sales plus the summed commission.
task :email_amazon_stats => :environment do
  puts "Start: Fetch and send Amazon Sales from yesterday (#{Time.now})"
  agent = Mechanize.new
  agent.cookie_jar.clear!
  agent.user_agent_alias = 'Mac Firefox'
  agent.follow_meta_refresh = true
  agent.redirect_ok = true
  dashboard_url = "https://partnernet.amazon.de/gp/associates/network/reports/report.html?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&tag=&reportType=earningsReport&program=all&deviceType=all&periodType=preSelected&preSelectedPeriod=yesterday&startDay=1&startMonth=11&startYear=2016&endDay=2&endMonth=11&endYear=2016&submit.display.x=87&submit.display.y=16&submit.display=Auf+der+Seite+anzeigen"
  # First request: the sign_in form is looked up on the page this returns.
  agent.get(dashboard_url)
  form = agent.page.form_with(:name => 'sign_in')
  form.username = ENV['AZON_PARTNER_USR']
  form.password = ENV['AZON_PARTNER_KEY']
  form.submit
  # Second request, after signing in: the report form is looked up here.
  dashboard = agent.get(dashboard_url)
  form2 = dashboard.form_with(:name => 'htmlReport')
  button = form2.button_with(:name => 'submit.download_XML')
  xml = agent.submit(form2, button)
  doc = Nokogiri::XML(xml.body)
  @sales = []
  doc.xpath("//Item").each do |item|
    @sales << {
      :sale_itemasin => item['ASIN'],
      :sale_itemname => item['Title'].truncate(80),
      :sale_date => Time.at(item['EDate'].to_i).strftime("%Y-%m-%d %H:%M:%S").to_s,
      # Amounts use a decimal comma; convert before formatting.
      :sale_amount => '%.2f' % item['Revenue'].gsub(',','.').to_f,
      :sale_commission => '%.2f' % item['Earnings'].gsub(',','.').to_f
    }
  end
  earnings = 0
  @sales.each do |s|
    earnings += s[:sale_commission].to_f
  end
  @total_commission = '%.2f' % earnings
  ReportsMailer.daily_dashboard(@total_commission,@sales).deliver
  puts "Done: Fetch and send Amazon Sales from yesterday (#{Time.now})"
end
As you can see yourself this is pretty ugly, because I try to go to the deeplink directly, which redirects me to the login page. There I login and try again to go to the dashboard. This time it works. Why ugly? Because if I try to go to the login page directly the code does not work, I somehow need this redirect. Any idea why? Would be interesting to understand this...

Scraping data in rails using thread

I am scraping data from a website into my database in Rails. Fetching the 32,000 records with this script works without any issue, but I want it to be faster, so I added threads to my rake task. Now, when I run the rake task, some of the data is fetched and then the task aborts.
I am not sure what to do next, so I would be really grateful for any help. Here is my rake task code for the scraping.
task scratch_to_database: :environment do
time2 = Time.now
puts "Current Time : " + time2.inspect
client = Mechanize.new
giftcard_types=Giftcard.card_types
find_all_merchant=Merchant.all.pluck(:id, :name).to_h
#first index page of the merchant
index_page = client.get('https://www.twitter.com//')
document_page_index = Nokogiri::HTML::Document.parse(index_page.body)
#set all merchant is deteled true
# set_merchant_as_deleted = Merchant.update_all(is_deleted: true) if Merchant.exists?
# set_giftcard_as_deleted = Giftcard.update_all(is_deleted: true) if Giftcard.exists?
update_all_merchant_record = []
update_all_giftcard_record = []
threads = []
#Merchant inner page pagination loop
page_no_merchant = document_page_index.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no_merchant) do |page_number|
threads << Thread.new do
client.get("https://www.twitter.com/buy-gift-cards?page=#{page_number}") do |page|
document = Nokogiri::HTML::Document.parse(page.body)
#Generate the name of the merchant and image of the merchant loop
document.css('.product-source').each do |item|
merchant_name= item.children.css('.name').text.gsub("Gift Cards", "")
href = item.css('a').first.attr('href')
image_url=item.children.css('.img img').attr('data-src').text.strip
#image url to parse the url of the image
image_url=URI.parse(image_url)
#saving the record of the merchant
# #merchant=Merchant.create(name: merchant_name , image_url:image_url)
if find_all_merchant.has_value?(merchant_name)
puts "this if"
merchant_id=find_all_merchant.key(merchant_name)
puts merchant_id
else
#merchant= Merchant.create(name: merchant_name , image_url:image_url)
update_all_merchant_record << #merchant.id
merchant_id=#merchant.id
end
# #merchant.update_attribute(:is_deleted, false)
#set all giftcard is deteled true
# set_giftcard_as_deleted = Giftcard.where(merchant_id: #merchant.id).update_all(is_deleted: true) if Giftcard.where(merchant_id: #merchant.id).exists?
#first page of the giftcard details page
first_page = client.get("https://www.twitter.com#{href}")
document_page = Nokogiri::HTML::Document.parse(first_page.body)
page_no = document_page.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
hrefextra =document_page.css('.dropdown-menu li a').last.attr('href')
#generate the giftcard details loop with the pagination
# update_all_record = []
find_all_giftcard=Giftcard.where(merchant_id:merchant_id).pluck(:row_id)
puts merchant_name
# puts find_all_giftcard.inspect
card_page = client.get("https://www.twitter.com#{hrefextra}")
document_page = Nokogiri::HTML::Document.parse(card_page.body)
#table details to generate the details of the giftcard with price ,per_off and final value of the giftcard
document_page.xpath('//table/tbody/tr[#class="toggle-details"]').collect do |row|
type1=[]
row_id = row.attr("id").to_i
row.at("td[2] ul").children.each do |typeli|
type = typeli.text.strip if typeli.text.strip.length != 0
type1 << type if typeli.text.strip.length != 0
end
value = row.at('td[3]').text.strip
value = value.to_s.tr('$', '').to_f
per_discount = row.at('td[4]').text.strip
per_discount = per_discount.to_s.tr('%', '').to_f
final_price = row.at('td[5] strong').text.strip
final_price = final_price.to_s.tr('$', '').to_f
type1.each do |type|
if find_all_giftcard.include?(row_id)
update_all_giftcard_record<<row_id
puts "exists"
else
puts "new"
#giftcard= Giftcard.create(card_type: giftcard_types.values_at(type.to_sym)[0], card_value:value, per_off:per_discount, card_price: final_price, merchant_id: merchant_id , row_id: row_id )
update_all_giftcard_record << #giftcard.row_id
end
end
#saving the record of the giftcard
# #giftcard=Giftcard.create(card_type:1, card_value:value, per_off:per_discount, card_price: final_price, merchant_id: #merchant.id , gift_card_type: type1)
end
# Giftcard.where(:id =>update_all_record).update_all(:is_deleted => false)
#delete all giftcard which is not present
# giftcard_deleted = Giftcard.where(:is_deleted => true,:merchant_id => #merchant.id).destroy_all if Giftcard.where(merchant_id: #merchant.id).exists?
time2 = Time.now
puts "Current Time : " + time2.inspect
end
end
end
end
threads.each(&:join)
puts "-------"
puts threads
# merchant_deleted = Merchant.where(:is_deleted => true).destroy_all if Merchant.exists?
merchant_deleted = Merchant.where('id NOT IN (?)',update_all_merchant_record).destroy_all if Merchant.exists?
giftcard_deleted = Giftcard.where('row_id NOT IN (?)',update_all_giftcard_record).destroy_all if Giftcard.exists?
end
end
Error i am receiving:
ActiveRecord::ConnectionTimeoutError: could not obtain a connection from the pool within 5.000 seconds (waited 5.001 seconds); all pooled connections were in use
Each thread requires a separate connection to your database. You need to increase the connection pool size that your application can use in your database.yml file.
But your database must also be capable of handling the incoming connections. If you are using MySQL, you can check this by running SELECT @@MAX_CONNECTIONS in your console.

Screen Scraping with nokogiri

I am a full-stack Ruby developer. I am scraping data from a website and I can fetch the data successfully. The problem is that the next time I fetch, I only want the new data; I don't want to overwrite all the data that is already in the database.
I just want to add the records that were added recently, but I have not found a way to do that with minimal queries and minimal code.
Here is my code which i am using for scrapping:
client = Mechanize.new
index_page = client.get('https://www.google.com/')
document_page_index = Nokogiri::HTML::Document.parse(index_page.body)
page_no_merchant = document_page_index.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no_merchant) do |page_number|
client.get("https://www.google.com/buy-gift-cards?page=#{page_number}") do |page|
document = Nokogiri::HTML::Document.parse(page.body)
document.css('.product-source').each do |item|
merchant_name= item.children.css('.name').text.gsub("Gift Cards", "")
puts merchant_name
href = item.css('a').first.attr('href')
puts href
image_url=item.children.css('.img img').attr('data-src').text.strip
puts image_url
image_url=URI.parse(image_url)
#merchant=Merchant.create!(name: merchant_name , image_url:image_url)
first_page = client.get("https://www.google.com#{href}")
document_page = Nokogiri::HTML::Document.parse(first_page.body)
page_no = document_page.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no) do |page_number_giftcard|
type1=[]
card_page = client.get("https://www.google.com#{href}?page=#{page_number_giftcard}")
document_page = Nokogiri::HTML::Document.parse(card_page.body)
document_page.xpath('//table/tbody/tr[#class="toggle-details"]').collect do |row|
row.at("td[2] ul").children.each do |typeli|
type = typeli.text.strip if typeli.text.strip.length != 0
type1 << type if typeli.text.strip.length != 0
end
value = row.at('td[3]').text.strip
value = value.to_s.tr('$', '').to_f
puts value
per_discount = row.at('td[4]').text.strip
per_discount = per_discount.to_s.tr('%', '').to_f
puts per_discount
final_price = row.at('td[5] strong').text.strip
final_price = final_price.to_s.tr('$', '').to_f
puts final_price
puts '******************************'
#giftcard=Giftcard.create(card_type:1, card_value:value, per_off:per_discount, card_price: final_price, merchant_id: #merchant.id)
end
#giftcard.update_attribute()
end
end
end
end
Thank you in advance.
Basically you are saving all data, by doing this.
#merchant=Merchant.create!(name: merchant_name , image_url:image_url)
You can try something like find_or_create_by.
#merchant=Merchant.find_or_create_by(name: merchant_name , image_url:image_url)
http://apidock.com/rails/v4.0.2/ActiveRecord/Relation/first_or_create
http://apidock.com/rails/v4.0.2/ActiveRecord/Relation/find_or_create_by

A ruby script and a rails model: Same code - different behaviour

I have coded the following ruby script:
require 'open-uri'
require 'Nokogiri'
require 'anemone'
# Holds the crawl state for one company site and scores HTML elements by how
# strongly they look like the container of a job listing.
class JobFox
  # Words whose presence hints at job-listing content.
  JOB_KEYWORDS = /job|career|position|opening/
  # There is no <h> element in HTML, so the old css('h') never matched anything
  # and the heading weight was never applied; select the real heading tags.
  HEADING_SELECTOR = 'h1, h2, h3, h4, h5, h6'.freeze

  attr_accessor :company_url,
                :jobs_page,
                :max_words,
                :jobs_part,
                :jobs_container,
                :element_score,
                :max_score,
                :jobs

  # Adds +element+'s keyword hits to the running element_score and records the
  # element as jobs_part when the running score exceeds max_score.
  #
  # Weights: class/id attribute hits x100, the element's own markup x5, each
  # descendant link x7, each list item x5, each heading x3.
  #
  # The running score is only reset when +element+ has no children, so it
  # carries over between successive calls until a leaf is scored.
  #
  # element_score and max_score must be initialised (e.g. to 0) beforehand.
  def calc_element_score(element)
    self.element_score += (keyword_hits(element['class']) + keyword_hits(element['id'])) * 100
    self.element_score += keyword_hits(element) * 5
    self.element_score += weighted_hits(element, 'a', 7)
    self.element_score += weighted_hits(element, 'li', 5)
    self.element_score += weighted_hits(element, HEADING_SELECTOR, 3)

    if element_score > max_score
      self.max_score = element_score
      self.jobs_part = element
    end

    self.element_score = 0 if element.children.count == 0
  end

  private

  # Number of job keywords found in the string form of +text+ (nil counts as 0).
  def keyword_hits(text)
    text.to_s.scan(JOB_KEYWORDS).count
  end

  # Sum of keyword hits over all nodes under +element+ matching +selector+,
  # each multiplied by +weight+.
  def weighted_hits(element, selector, weight)
    element.css(selector).inject(0) { |total, node| total + keyword_hits(node) * weight }
  end
end
fox = JobFox.new
fox.company_url = 'http://www.website.com'
fox.max_words = 0
fox.jobs = []
# CRAWL THE WEBSITE TO FIND THE JOBS LINK
Anemone.crawl(fox.company_url, :depth_limit => 3) do |anemone|
anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
begin
puts "SCANNING: " + page.url.to_s
# SCAN THE HTML AND FIND THE OCCURENCES OF THE WORD "JOB"
source_html = open(page.url).read
job_occurences = source_html.scan(/job|jobs|work|position/).count
# IF MORE OCCURENCES THAN BEFORE, WE KEEP THE PAGE URL
if job_occurences > fox.max_words
fox.max_words = job_occurences
fox.jobs_page = page.url
end
rescue Exception => e
puts e
end
end
end
fox.jobs_container = Nokogiri::HTML(open(fox.jobs_page))
fox.element_score = fox.max_score = 0
fox.jobs_container.css('div, section').each do |container|
container.traverse do |element|
fox.calc_element_score(element)
end
end
fox.jobs_part.traverse do |element|
element.css('a').each do |job|
fox.jobs << job.text
end
end
# REMOVE POSSIBLE DUPLICATE ENTRIES
fox.jobs = fox.jobs.uniq
puts fox.jobs
and I am trying to port it to a rails application - not as a script/task but as a model function:
require 'anemone'
require 'open-uri'
# The library file is lowercase; 'Nokogiri' only loads on case-insensitive
# filesystems.
require 'nokogiri'

# A company whose website is crawled to locate its jobs page and to extract the
# job links from the most job-like container on that page.
class Company < ActiveRecord::Base
  # Must be a Regexp: String#scan with a String pattern searches for that
  # literal text, so the previous '/job|career|position|opening/' string never
  # matched and every score stayed 0.
  JOBS_EXPRESSION = /job|career|position|opening/
  # HTML has no <h> element; select the real heading tags instead.
  HEADING_SELECTOR = 'h1, h2, h3, h4, h5, h6'.freeze

  has_many :jobs
  accepts_nested_attributes_for :jobs

  # CALCULATE THE RELATEDNESS OF EACH HTML ELEMENT
  #
  # Adds +element+'s keyword hits to the running @element_score and remembers
  # the element in @jobs_part when the running score exceeds @max_score.
  # Weights: class/id hits x100, own markup x5, links x7, list items x5,
  # headings x3. The running score is only reset on elements without children.
  def calculate_element_score(element)
    # Allow calling this without find_jobs_container having initialised state.
    @element_score ||= 0
    @max_score ||= 0

    @element_score += (keyword_hits(element['class']) + keyword_hits(element['id'])) * 100
    @element_score += keyword_hits(element) * 5
    { 'a' => 7, 'li' => 5, HEADING_SELECTOR => 3 }.each do |selector, weight|
      element.css(selector).each do |node|
        @element_score += keyword_hits(node) * weight
      end
    end

    if @element_score > @max_score
      @max_score = @element_score
      @jobs_part = element
    end

    @element_score = 0 if element.children.count == 0
  end

  # CRAWL THE WEBSITE TO FIND THE JOBS PAGE
  #
  # Crawls self.website up to depth 3 and stores in jobs_page the URL of the
  # matching page with the most occurrences of job-related words.
  def find_jobs_page
    max_words = 0
    Anemone.crawl(self.website, :depth_limit => 3) do |anemone|
      anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
        begin
          # SCAN THE HTML AND FIND OCCURENCES OF RELEVANT WORDS
          source_html = open(page.url).read
          job_occurences = source_html.scan(/job|jobs|work|position/).count
          # IF MORE OCCURENCES THAN BEFORE, KEEP THE PAGE URL
          if job_occurences > max_words
            max_words = job_occurences
            self.jobs_page = page.url
          end
        rescue StandardError => e
          # Best effort: one unreadable page must not stop the crawl. Rescue
          # StandardError rather than Exception so interrupts still propagate.
          puts e
        end
      end
    end
  end

  # FIND THE CONTAINER THAT HAS THE JOB LISTINGS
  #
  # Scores every node under each div/section of the jobs page; the best
  # scoring element ends up in @jobs_part.
  def find_jobs_container
    jobs_container = Nokogiri::HTML(open(self.jobs_page))
    @element_score = @max_score = 0
    jobs_container.css('div, section').each do |container|
      container.traverse do |element|
        self.calculate_element_score(element)
      end
    end
  end

  # ADD THE JOBS FROM THE PAGE TO THE COMPANY ASSOCIATION
  #
  # Adds one Job per link found inside @jobs_part. Querying the links once
  # avoids adding the same link again for every ancestor that contains it.
  def extract_jobs
    return unless @jobs_part # nothing scored above 0, so there is no container

    @jobs_part.css('a').each do |link|
      job = Job.new
      job.title = link.text
      # Store the link target, not the Nokogiri node itself.
      job.url = link['href']
      self.jobs << job
    end
  end

  # THE METHOD TO FIND ALL THE JOBS FOR A COMPANY
  def find_jobs
    self.find_jobs_page
    self.find_jobs_container
    self.extract_jobs
  end

  private

  # Number of job keywords in the string form of +text+ (nil counts as 0).
  def keyword_hits(text)
    text.to_s.scan(JOBS_EXPRESSION).count
  end
end
Everything works just fine apart from the calculate_element_score method - @element_score is always 0. Have I understood something entirely wrong regarding global variables?

Gem Resque Error - Undefined "method perform" after Overriding it form the super class

First of all Thanks for you all for helping programmers like me with your valuable inputs in solving day to day issues.
This is my first question in stack overflow as I am experiencing this problems from almost one week.
We are building a crawler which crawls specific websites and extracts content from them, using Mechanize to achieve this. As it was taking a lot of time, we decided to run the crawling process as a background task using the Resque gem with Redis, but when sending the process to the background I get the error in the title.
my code in lib/parsers/home.rb
require 'resque'
require File.dirname(__FILE__)+"/../index"
class Home < Index
Resque.enqueue(Index , :page )
def self.perform(page)
super (page)
search_form = page.form_with :name=>"frmAgent"
resuts_page = search_form.submit
total_entries = resuts_page.parser.xpath('//*[#id="PagingTable"]/tr[2]/td[2]').text
if total_entries =~ /(\d+)\s*$/
total_entries = $1
else
total_entries = "unknown"
end
start_res_idx = 1
while true
puts "Found #{total_entries} entries"
detail_links = resuts_page.parser.xpath('//*[#id="MainTable"]/tr/td/a')
detail_links.each do |d_link|
if d_link.attribute("class")
next
else
data_page = #agent.get d_link.attribute("href")
fields = get_fields_from_page data_page
save_result_page page.uri.to_s, fields
#break
end
end
site_done
rescue Exception => e
puts "error: #{e}"
end
end
and the superclass in lib/index.rb is
require 'resque'
require 'mechanize'
require 'mechanize/form'
class Index
#queue = :Index_queue
def initialize(site)
#site = site
#agent = Mechanize.new
#agent.user_agent = Mechanize::AGENT_ALIASES['Windows Mozilla']
#agent.follow_meta_refresh = true
#rows_parsed = 0
#rows_total = 0
rescue Exception => e
log "Unable to login: #{e.message}"
end
def run
log "Parsing..."
url = "unknown"
if #site.url
url = #site.url
log "Opening #{url} as a data page"
#page = #agent.get(url)
#perform method should be override in subclasses
#data = self.perform(#page)
else
#some sites do not have "datapage" URL
#for example after login you're already on your very own datapage
#this is to be addressed in 'perform' method of subclass
#data = self.perform(nil)
end
rescue Exception=>e
puts "Failed to parse URL '#{url}', exception=>"+e.message
set_site_status("error "+e.message)
end
#overriding method
def self.perform(page)
end
def save_result_page(url, result_params)
result = Result.find_by_sql(["select * from results where site_id = ? AND ref_code = ?", #site.id, utf8(result_params[:ref_code])]).first
if result.nil?
result_params[:site_id] = #site.id
result_params[:time_crawled] = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result_params[:link] = url
result = Result.create result_params
else
result.result_fields.each do |f|
f.delete
end
result.link = url
result.time_crawled = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result.html = result_params[:html]
fields = []
result_params[:result_fields_attributes].each do |f|
fields.push ResultField.new(f)
end
result.result_fields = fields
result.save
end
#rows_parsed +=1
msg = "Saved #{#rows_parsed}"
msg +=" of #{#rows_total}" if #rows_total.to_i > 0
log msg
return result
end
end
What's Wrong with this code?
Thanks

Resources