Scraping data in rails using thread - ruby-on-rails

I am scraping data from a website into my database in Rails. I can fetch all 32,000 records with this script without any issue, but I want to fetch the data faster, so I added threads to my rake task. Now, while the rake task is running, some of the data is fetched and then the task aborts.
I am not sure what to do; any help would be greatly appreciated. Here is my rake task code for the scraping.
task scratch_to_database: :environment do
time2 = Time.now
puts "Current Time : " + time2.inspect
client = Mechanize.new
giftcard_types=Giftcard.card_types
find_all_merchant=Merchant.all.pluck(:id, :name).to_h
#first index page of the merchant
index_page = client.get('https://www.twitter.com//')
document_page_index = Nokogiri::HTML::Document.parse(index_page.body)
#set all merchant is deleted true
# set_merchant_as_deleted = Merchant.update_all(is_deleted: true) if Merchant.exists?
# set_giftcard_as_deleted = Giftcard.update_all(is_deleted: true) if Giftcard.exists?
update_all_merchant_record = []
update_all_giftcard_record = []
threads = []
#Merchant inner page pagination loop
page_no_merchant = document_page_index.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no_merchant) do |page_number|
threads << Thread.new do
client.get("https://www.twitter.com/buy-gift-cards?page=#{page_number}") do |page|
document = Nokogiri::HTML::Document.parse(page.body)
#Generate the name of the merchant and image of the merchant loop
document.css('.product-source').each do |item|
merchant_name= item.children.css('.name').text.gsub("Gift Cards", "")
href = item.css('a').first.attr('href')
image_url=item.children.css('.img img').attr('data-src').text.strip
#image url to parse the url of the image
image_url=URI.parse(image_url)
#saving the record of the merchant
# @merchant=Merchant.create(name: merchant_name , image_url:image_url)
if find_all_merchant.has_value?(merchant_name)
puts "this if"
merchant_id=find_all_merchant.key(merchant_name)
puts merchant_id
else
@merchant= Merchant.create(name: merchant_name , image_url:image_url)
update_all_merchant_record << @merchant.id
merchant_id=@merchant.id
end
# @merchant.update_attribute(:is_deleted, false)
#set all giftcard is deleted true
# set_giftcard_as_deleted = Giftcard.where(merchant_id: @merchant.id).update_all(is_deleted: true) if Giftcard.where(merchant_id: @merchant.id).exists?
#first page of the giftcard details page
first_page = client.get("https://www.twitter.com#{href}")
document_page = Nokogiri::HTML::Document.parse(first_page.body)
page_no = document_page.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
hrefextra =document_page.css('.dropdown-menu li a').last.attr('href')
#generate the giftcard details loop with the pagination
# update_all_record = []
find_all_giftcard=Giftcard.where(merchant_id:merchant_id).pluck(:row_id)
puts merchant_name
# puts find_all_giftcard.inspect
card_page = client.get("https://www.twitter.com#{hrefextra}")
document_page = Nokogiri::HTML::Document.parse(card_page.body)
#table details to generate the details of the giftcard with price ,per_off and final value of the giftcard
document_page.xpath('//table/tbody/tr[@class="toggle-details"]').collect do |row|
type1=[]
row_id = row.attr("id").to_i
row.at("td[2] ul").children.each do |typeli|
type = typeli.text.strip if typeli.text.strip.length != 0
type1 << type if typeli.text.strip.length != 0
end
value = row.at('td[3]').text.strip
value = value.to_s.tr('$', '').to_f
per_discount = row.at('td[4]').text.strip
per_discount = per_discount.to_s.tr('%', '').to_f
final_price = row.at('td[5] strong').text.strip
final_price = final_price.to_s.tr('$', '').to_f
type1.each do |type|
if find_all_giftcard.include?(row_id)
update_all_giftcard_record<<row_id
puts "exists"
else
puts "new"
@giftcard= Giftcard.create(card_type: giftcard_types.values_at(type.to_sym)[0], card_value:value, per_off:per_discount, card_price: final_price, merchant_id: merchant_id , row_id: row_id )
update_all_giftcard_record << @giftcard.row_id
end
end
#saving the record of the giftcard
# @giftcard=Giftcard.create(card_type:1, card_value:value, per_off:per_discount, card_price: final_price, merchant_id: @merchant.id , gift_card_type: type1)
end
# Giftcard.where(:id =>update_all_record).update_all(:is_deleted => false)
#delete all giftcard which is not present
# giftcard_deleted = Giftcard.where(:is_deleted => true,:merchant_id => @merchant.id).destroy_all if Giftcard.where(merchant_id: @merchant.id).exists?
time2 = Time.now
puts "Current Time : " + time2.inspect
end
end
end
end
threads.each(&:join)
puts "-------"
puts threads
# merchant_deleted = Merchant.where(:is_deleted => true).destroy_all if Merchant.exists?
merchant_deleted = Merchant.where('id NOT IN (?)',update_all_merchant_record).destroy_all if Merchant.exists?
giftcard_deleted = Giftcard.where('row_id NOT IN (?)',update_all_giftcard_record).destroy_all if Giftcard.exists?
end
The error I am receiving:
ActiveRecord::ConnectionTimeoutError: could not obtain a connection from the pool within 5.000 seconds (waited 5.001 seconds); all pooled connections were in use

Each thread requires a separate connection to your database. You need to increase the connection pool size that your application can use in your database.yml file.
But your database should also be capable of handling the incoming connections. If you are using MySQL, you can check the limit by running select @@MAX_CONNECTIONS on your console.
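To make that concrete, here is a minimal sketch, not the poster's actual config, of the two knobs involved: raising pool in config/database.yml to at least the number of threads you spawn, and checking a connection out explicitly inside each thread so it goes back to the pool as soon as the database work finishes.
require "active_record"

# In config/database.yml, under the environment the rake task runs in,
# raise the pool size, e.g.:
#   pool: 40   # >= the number of threads you start
threads = []
1.upto(10) do |page_number|               # 10 stands in for page_no_merchant
  threads << Thread.new do
    ActiveRecord::Base.connection_pool.with_connection do
      # ... Mechanize/Nokogiri scraping and Merchant/Giftcard writes for this page ...
    end
  end
end
threads.each(&:join)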

Related

Nokogiri scraping with Ruby On Rails not working as expected

I'm completely new to Ruby on Rails, but I think I might be missing something obvious. I'm currently working on a web app that scrapes auction websites. The bones of the app were created by someone else, and I'm now trying to add scrapes for new websites, but they don't seem to be working.
I have read through some of the Nokogiri documentation, checked that the scraped information is indeed not being written to the database (the seeded URLs that are being targeted have been, when I check via the rails console), and used the Chrome extension CSS Selector Tester to check that I am targeting the correct CSS selectors. The record IDs are correct when I check via the rails console.
I have put what I think are the important sections of code below, but I might be missing something that I don't realise is important.
The websites I'm having issues with are Lot-art.com and Lot-Tissimo.com.
Any help will be much appreciated.
Seeded URLs
Source.create(name: "Auction.fr", query_template: "https://www.auction.fr/_en/lot/search/?contexte=futures&tri=date_debut%20ASC&query={query}&page={page}")
Source.create(name: "Invaluable.co.uk", query_template: "https://www.invaluable.co.uk/search/api/search-results?keyword={query}&size=1000")
Source.create(name: "Interencheres.com", query_template: "http://www.interencheres.com/en/recherche/lot?search%5Bkeyword%5D={query}&page={page}")
Source.create(name: "Gazette-drouot.com", query_template: "http://catalogue.gazette-drouot.com/html/g/recherche.jsp?numPage={page}&filterDate=1&query={query}&npp=100")
Source.create(name: "Lot-art.com", query_template: "http://www.lot-art.com/auction-search/?form_id=lot_search_form&page=1&mq=&q={query}&ord=recent")
Source.create(name: "Lot-tissimo.com", query_template: "https://lot-tissimo.com/en/cmd=s&lwr=&ww={query}&xw=&srt=SN&wg=EUR&page={page}")
Scheduler code
require 'rufus-scheduler'
require 'nokogiri'
require 'mechanize'
require 'open-uri'
require "net/https"
s = Rufus::Scheduler.singleton
s.interval '1m' do
setting = Setting.find(1)
agent = Mechanize.new
agent.user_agent_alias = 'Windows Chrome'
agent.cookie_jar.load(File.join(Rails.root, 'tmp/cookies.yaml'))
List.all.each do |list|
number_of_new_items = 0
list.actions.each do |action|
url = action.source.query_template.gsub('{query}', action.list.query)
case action.source.id
when 1 # Auction.fr
20.downto(1) do |page|
doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
doc.css("div.list-products > ul > li").reverse.each do |item_data|
price = 0
if item_data.at_css("h3.h4.adjucation.ft-blue") && /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)
price = /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)[1].gsub(" ", "")
end
item = action.items.new(
title: item_data.at_css("h2").text.strip,
url: item_data.at_css("h2 a")["href"],
picture: item_data.at_css("div.image-wrap.lazy div.image img")["src"],
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disables log writing
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
when 97 # Lot-Tissimo.com
5.downto(1) do |page|
doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
doc.css("#inhalt > .objektliste").reverse.each do |item_data|
# price = 0
# if item_data.at_css("h3.h4.adjucation.ft-blue") && /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)
# price = /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)[1].gsub(" ", "")
# end
item = action.items.new(
title: item_data.at_css("div.objli-desc").text.strip,
url: item_data.at_css("td.objektliste-foto a")["href"],
picture: item_data.at_css("td.objektliste-foto a#lot_link img")["src"],
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disables log writing
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
when 2 # Invaluable.co.uk
doc = JSON.parse(open(url).read)
doc["itemViewList"].reverse.each do |item_data|
puts item_data["itemView"]["photos"]
item = action.items.new(
title: item_data["itemView"]["title"],
url: "https://www.invaluable.co.uk/buy-now/" + item_data["itemView"]["title"].parameterize + "-" + item_data["itemView"]["ref"],
picture: item_data["itemView"]["photos"] != nil ? item_data["itemView"]["photos"].first["_links"]["medium"]["href"] : nil,
price: item_data["itemView"]["price"],
currency: item_data["itemView"]["currencySymbol"]
)
ActiveRecord::Base.logger.silence do # This disables log writing
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
when 3 # Interencheres.com
# doc = Nokogiri::HTML(open(url))
5.downto(1) do |page|
doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
doc.css("div#lots_0 div.ligne_vente").reverse.each do |item_data|
price = 0
item = action.items.new(
title: item_data.at_css("div.ph_vente div.des_vente p a").text.strip,
url: "http://www.interencheres.com" + item_data.at_css("div.ph_vente div.des_vente p a")["href"],
picture: item_data.at_css("div.ph_vente div.gd_ph_vente img")["src"],
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disables log writing
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
when 4 # Gazette-drouot.com
5.downto(1) do |page|
# doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
doc = agent.get(url.gsub('{page}', page.to_s))
# doc = agent.get(url)
doc.css("div#recherche_resultats div.lot_recherche").reverse.each do |item_data|
price = 0
picture = item_data.at_css("img.image_thumb_recherche") ? item_data.at_css("img.image_thumb_recherche")["src"] : nil
item = action.items.new(
title: item_data.at_css("#des_recherche").text.strip.truncate(140),
url: "http://catalogue.gazette-drouot.com/html/g/" + item_data.at_css("a.lien_under")["href"],
picture: picture,
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disables log writing
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
when 69 # Lot-art.com
doc = agent.get(url)
doc.css("div.lot_list_holder").reverse.each do |item_data|
price = 0
item = action.items.new(
title: item_data.at_css("div.lot_list_body a")[0].text.strip.truncate(140),
url: item_data.at_css("div.lot_list_body")["href"],
picture: item_data.at_css("a.lot_list_thumb img") ["src"],
price: price,
currency: "€"
)
ActiveRecord::Base.logger.silence do # This disables log writing
if item.save
number_of_new_items = number_of_new_items + 1
end
end
end
end
end
if number_of_new_items > 0 && setting.notifications_per_hour > setting.notifications_this_hour && setting.pushover_app_token.present? && setting.pushover_user_key.present?
url = URI.parse("https://api.pushover.net/1/messages.json")
req = Net::HTTP::Post.new(url.path)
req.set_form_data({
:token => setting.pushover_app_token,
:user => setting.pushover_user_key,
:message => "#{number_of_new_items} new items on #{list.name}!",
:url_title => "Check the list",
:url => "http://spottheauction.com/lists/#{list.id}"
})
res = Net::HTTP.new(url.host, url.port)
res.use_ssl = true
res.verify_mode = OpenSSL::SSL::VERIFY_PEER
res.start {|http| http.request(req) }
end
end
agent.cookie_jar.save(File.join(Rails.root, 'tmp/cookies.yaml'))
end
s.cron '0 * * * *' do
setting = Setting.find(1)
setting.notifications_this_hour = 0
setting.save
end
new just initializes an instance but doesn't save it. Do you actually call save anywhere?
You have two options:
Call save on the item:
item = action.items.new(
# ...
)
item.save
Or use create instead of new:
item = action.items.create(
# ...
)
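If save quietly returns false (for example because a validation fails), the record's errors explain why. While debugging, something like the following sketch, which is not part of the original answer, prints the reason instead of failing silently:
unless item.save
  puts item.errors.full_messages
end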
In case someone else comes across this: I got the scraping of lot-art.com to work. It turned out I was lacking specificity in the CSS selector, so Nokogiri wasn't pulling the correct data.
I am still having issues with lot-tissimo.com, although that appears to come from something else, since other scrapers such as Scrapinghub's Portia spiders also have trouble with it.
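For anyone hitting the same thing, the difference specificity makes looks roughly like this; the markup and selectors below are invented for illustration and are not taken from lot-art.com:
require 'nokogiri'

html = <<~HTML
  <div class="lot_list_holder">
    <div class="lot_list_body"><a href="/lot/1">Lot title</a></div>
  </div>
HTML
doc = Nokogiri::HTML(html)

doc.at_css("a")                                          # too broad: first <a> anywhere on the page
doc.at_css("div.lot_list_holder div.lot_list_body > a")  # scoped to the container you actually want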

rufus gem how to keep one job running while stopping the other

This is my scheduler.rb in my initializer file
unless defined?(Rails::Console) || File.split($0).last == 'rake'
s = Rufus::Scheduler.singleton
s.every '1m', :tag => 'main_process' do
Rails.logger.info "hello, it's #{Time.now}"
Rails.logger.flush
Bid.all.each do |bid|
id = bid.event_id
puts "*" * 50
puts bid.id
puts bid.event_id
puts "*" * 50
# @price = BuyNowBid.find_by(bid_id: params[:bid_id])
@events = Unirest.get("https://api.seatgeek.com/2/events/#{id}?&client_id=NjQwNTEzMXwxNDgxNDkxODI1").body
if @events["stats"]
@low = @events["stats"]["lowest_price"] || 0
@avg = @events["stats"]["average_price"] || 0
BuyNowBid.create(bid_id: bid.id, lowest_price: @low , average_price: @avg)
if @low <= bid.bid
send_message("+13125501444", "Lowest price matched! Buy your ticket now!")
bid.bid = 0
bid.save
end
else
puts 'problem with @events?'
p @events
end
end
end
end
def send_message(phone_number, alert_message)
account_sid = ""
auth_token = ""
@client = Twilio::REST::Client.new account_sid, auth_token
@twilio_number = ""
message = @client.messages.create(
:from => @twilio_number,
:to => phone_number,
:body => alert_message
)
puts message.to
end
So I'm running a job that pulls the lowest price from an API every minute, and when the lowest price matches a bid someone has placed they receive a text notification. This works, but the problem is that I want the job to keep running, pulling the lowest price from the API every minute, without the user getting the same text notification every minute.
Right now that doesn't happen, but only because after a bid is matched it is essentially deleted from the database. So basically I'm asking: how can I keep polling the API every minute for a lowest-price match, but send only one text notifying the user of the match, without having to delete that bid from the database?
I was really overthinking this. I just created a new column called saved_bid on my bids table and set it equal to bid.bid before it became zero:
@events = Unirest.get("https://api.seatgeek.com/2/events/#{id}?&client_id=NjQwNTEzMXwxNDgxNDkxODI1").body
if @events["stats"]
@low = @events["stats"]["lowest_price"] || 0
@avg = @events["stats"]["average_price"] || 0
BuyNowBid.create(bid_id: bid.id, lowest_price: @low , average_price: @avg)
if @low <= bid.bid
send_message("+13125501444", "Lowest price matched for #{@events["title"]} ! Buy your ticket now for #{bid.saved_bid}!")
bid.saved_bid = bid.bid
bid.bid = 0
bid.save
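The "new column" mentioned above would come from an ordinary migration. A minimal sketch, assuming the table is bids and a decimal column fits the bid values (older Rails versions omit the version bracket on the migration superclass):
class AddSavedBidToBids < ActiveRecord::Migration[5.0]
  def change
    add_column :bids, :saved_bid, :decimal, precision: 8, scale: 2
  end
end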

Screen Scraping with nokogiri

I am a full-stack Ruby developer. I am scraping data from a website, and I can fetch the data successfully. The problem is that the next time I fetch the data I only want to fetch the new records; I don't want to overwrite all the existing data in the database.
I just want to add the records that were added recently, but I can't find a solution for doing this with a minimum of queries and code.
Here is the code I am using for scraping:
client = Mechanize.new
index_page = client.get('https://www.google.com/')
document_page_index = Nokogiri::HTML::Document.parse(index_page.body)
page_no_merchant = document_page_index.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no_merchant) do |page_number|
client.get("https://www.google.com/buy-gift-cards?page=#{page_number}") do |page|
document = Nokogiri::HTML::Document.parse(page.body)
document.css('.product-source').each do |item|
merchant_name= item.children.css('.name').text.gsub("Gift Cards", "")
puts merchant_name
href = item.css('a').first.attr('href')
puts href
image_url=item.children.css('.img img').attr('data-src').text.strip
puts image_url
image_url=URI.parse(image_url)
@merchant=Merchant.create!(name: merchant_name , image_url:image_url)
first_page = client.get("https://www.google.com#{href}")
document_page = Nokogiri::HTML::Document.parse(first_page.body)
page_no = document_page.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no) do |page_number_giftcard|
type1=[]
card_page = client.get("https://www.google.com#{href}?page=#{page_number_giftcard}")
document_page = Nokogiri::HTML::Document.parse(card_page.body)
document_page.xpath('//table/tbody/tr[@class="toggle-details"]').collect do |row|
row.at("td[2] ul").children.each do |typeli|
type = typeli.text.strip if typeli.text.strip.length != 0
type1 << type if typeli.text.strip.length != 0
end
value = row.at('td[3]').text.strip
value = value.to_s.tr('$', '').to_f
puts value
per_discount = row.at('td[4]').text.strip
per_discount = per_discount.to_s.tr('%', '').to_f
puts per_discount
final_price = row.at('td[5] strong').text.strip
final_price = final_price.to_s.tr('$', '').to_f
puts final_price
puts '******************************'
@giftcard=Giftcard.create(card_type:1, card_value:value, per_off:per_discount, card_price: final_price, merchant_id: @merchant.id)
end
@giftcard.update_attribute()
end
end
end
end
Thank you in advance.
Basically you are saving all the data every time by doing this:
@merchant = Merchant.create!(name: merchant_name, image_url: image_url)
You can try something like find_or_create_by instead:
@merchant = Merchant.find_or_create_by(name: merchant_name, image_url: image_url)
http://apidock.com/rails/v4.0.2/ActiveRecord/Relation/first_or_create
http://apidock.com/rails/v4.0.2/ActiveRecord/Relation/find_or_create_by
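One refinement worth noting: find_or_create_by uses every attribute you pass as part of the lookup, so if the image URL ever changes you would still get a duplicate merchant. A sketch that treats the name as the natural key and only sets image_url on creation, assuming merchant names are unique on the site:
@merchant = Merchant.find_or_create_by(name: merchant_name) do |m|
  m.image_url = image_url   # this block runs only when a new record is created
end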

Ruby on rails rake task not returning up-to-date model info

While building a billing system I encountered the following problem:
In scheduler.rake :
@users.each do |user|
begin
puts "CALCULATING COSTS =========="
costs = user.total_billable_price_per_month(Time.now)
lead_count = user.total_billable_leads_count(Time.now)
click_count = user.total_billable_clicks_count(Time.now)
puts "CREATING NEW INVOICES ======="
@invoice = user.invoices.new
# if user.free_credits
# invoice.total_price_in_cents = 0
# else
# invoice.total_price_in_cents = costs.cents
# end
@invoice.total_leads = lead_count
@invoice.total_clicks = click_count
@invoice.total_price_in_cents = costs.cents
# If the previous invoice was under 10 euros, add its amount to the current invoice
if user.invoices.last && user.invoices.last.total_price_in_cents < 1000
puts "Bedrag onder 10 euro"
puts "Last invoice = #{user.invoices.last.total_price_in_cents}"
@invoice.total_price_in_cents += user.invoices.last.total_price_in_cents
end
puts "SAVING INVOICE FOR #{user.name} with ID = #{user.id}"
# Save the invoice
@invoice.save
# If the invoice is 10 euros or more, create an invoice
if @invoice.total_price_in_cents >= 1000
# Send the Moneybird invoice
puts "COSTS ARE HIGHER THAN 10 EURO, CREATING MONEYBIRD INVOICE"
moneybird_invoice = MoneybirdInvoice.new
sleep(15)
moneybird_invoice.contact_id = MoneybirdContact.find_by_customer_id(user.id).id
sleep(15)
moneybird_invoice.details_attributes = [
{ :description => "Aantal leads", :amount => lead_count, :price => user.lead_price.cents.to_f/100},
{ :description => "Aantal clicks", :amount => click_count, :price => user.click_price.cents.to_f/100}
]
puts "TRYING TO SAVE MONEYBIRD INVOICE"
if moneybird_invoice.save && moneybird_invoice.put(:send_invoice)
puts "SUCCESFULLY SAVED INVOICE"
#GET UPDATED PAY URL
@sent_mb_invoice = MoneybirdInvoice.get(moneybird_invoice.id)
sleep(15)
@invoice.update_attributes(:moneybird_invoice_id => @sent_mb_invoice['invoice_id'], :moneybird_pay_url => @sent_mb_invoice['pay_url'])
else
puts "MONEYBIRD INVOICE FAILED"
puts moneybird_invoice.errors.inspect
end
else
# DO NOT CREATE A MONEYBIRD INVOICE
end
rescue
puts "ER IS IETS FOUT GEGAAN MET FACTUREREN USER ID = #{user.id} & NAME = #{user.name}"
puts @invoice.errors.inspect
end
end
This piece of code should constantly increment each time the rake task is run, except when the total amount reaches > 1000.
if user.invoices.last && user.invoices.last.total_price_in_cents < 1000
puts "Bedrag onder 10 euro"
puts "Last invoice = #{user.invoices.last.total_price_in_cents}"
@invoice.total_price_in_cents += user.invoices.last.total_price_in_cents
end
The code above always puts "Last invoice = 100", when this value should increment each time the rake task is run.
Every new invoice still has the same total_price_in_cents, while I'm expecting it to increment.
What is going on?
EDIT: Added after the code update:
In your updated code, it looks like you are calling user.invoices.last after you call user.invoices.new; this is why it always returns the same value.
Create a variable @last_invoice = user.invoices.last before calling user.invoices.new.
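In code, the suggestion amounts to something like this sketch, reusing the variable names from the question:
@last_invoice = user.invoices.last        # read the previous invoice first
@invoice = user.invoices.new
@invoice.total_price_in_cents = costs.cents
if @last_invoice && @last_invoice.total_price_in_cents < 1000
  @invoice.total_price_in_cents += @last_invoice.total_price_in_cents
end
@invoice.save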
ORIGINAL ANSWER:
In your original code posting, it looks like your save call on @invoice happened outside the loop -- I believe you're only saving it once.
task :create_invoices => :environment do
# Begin the loop for each user
User.all.each do |user|
@invoice = user.invoices.build
# If last bill < 1000
if user.invoices.last && user.invoices.last.total_price_in_cents < 1000
puts "Last invoice = #{user.invoices.last.total_price_in_cents}"
@invoice.total_price_in_cents += user.invoices.last.total_price_in_cents
@invoice.save
end # end 'if' statement
end # end loop for all users
end # end task definition
So you loop through the users table but never save updates -- except for the very last time, after you exit the loop.

Gem Resque Error - undefined method 'perform' after overriding it from the superclass

First of all, thanks to all of you for helping programmers like me with your valuable input in solving day-to-day issues.
This is my first question on Stack Overflow; I have been struggling with this problem for almost a week.
We are building a crawler which crawls specific websites and extracts their content. We are using Mechanize to achieve this. As it was taking a lot of time, we decided to run the crawling process as a background task using the resque gem with redis, but while sending the process to the background I am getting the error in the title.
My code in lib/parsers/home.rb:
require 'resque'
require File.dirname(__FILE__)+"/../index"
class Home < Index
Resque.enqueue(Index , :page )
def self.perform(page)
super (page)
search_form = page.form_with :name=>"frmAgent"
resuts_page = search_form.submit
total_entries = resuts_page.parser.xpath('//*[@id="PagingTable"]/tr[2]/td[2]').text
if total_entries =~ /(\d+)\s*$/
total_entries = $1
else
total_entries = "unknown"
end
start_res_idx = 1
while true
puts "Found #{total_entries} entries"
detail_links = resuts_page.parser.xpath('//*[@id="MainTable"]/tr/td/a')
detail_links.each do |d_link|
if d_link.attribute("class")
next
else
data_page = @agent.get d_link.attribute("href")
fields = get_fields_from_page data_page
save_result_page page.uri.to_s, fields
#break
end
end
site_done
rescue Exception => e
puts "error: #{e}"
end
end
and the superclass in lib/index.rb is
require 'resque'
require 'mechanize'
require 'mechanize/form'
class Index
@queue = :Index_queue
def initialize(site)
@site = site
@agent = Mechanize.new
@agent.user_agent = Mechanize::AGENT_ALIASES['Windows Mozilla']
@agent.follow_meta_refresh = true
@rows_parsed = 0
@rows_total = 0
rescue Exception => e
log "Unable to login: #{e.message}"
end
def run
log "Parsing..."
url = "unknown"
if @site.url
url = @site.url
log "Opening #{url} as a data page"
@page = @agent.get(url)
# perform method should be overridden in subclasses
@data = self.perform(@page)
else
#some sites do not have "datapage" URL
#for example after login you're already on your very own datapage
#this is to be addressed in 'perform' method of subclass
@data = self.perform(nil)
end
rescue Exception=>e
puts "Failed to parse URL '#{url}', exception=>"+e.message
set_site_status("error "+e.message)
end
#overriding method
def self.perform(page)
end
def save_result_page(url, result_params)
result = Result.find_by_sql(["select * from results where site_id = ? AND ref_code = ?", @site.id, utf8(result_params[:ref_code])]).first
if result.nil?
result_params[:site_id] = @site.id
result_params[:time_crawled] = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result_params[:link] = url
result = Result.create result_params
else
result.result_fields.each do |f|
f.delete
end
result.link = url
result.time_crawled = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result.html = result_params[:html]
fields = []
result_params[:result_fields_attributes].each do |f|
fields.push ResultField.new(f)
end
result.result_fields = fields
result.save
end
@rows_parsed += 1
msg = "Saved #{@rows_parsed}"
msg += " of #{@rows_total}" if @rows_total.to_i > 0
log msg
return result
end
end
What's wrong with this code?
Thanks
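For context, the shape Resque expects is roughly the following generic sketch (class and queue names are illustrative, not the poster's). Note that Resque serialises job arguments to JSON in Redis, so only plain data such as a URL survives the round trip; a live Mechanize page object passed to Resque.enqueue would not.
require 'resque'
require 'mechanize'

class CrawlPageJob
  @queue = :crawler                      # queue this job class is pushed onto

  def self.perform(url)                  # workers call Klass.perform(*args)
    page = Mechanize.new.get(url)
    # ... parse the page and save results ...
  end
end

Resque.enqueue(CrawlPageJob, 'http://example.com/agents')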
