Screen Scraping with nokogiri - ruby-on-rails

I am a full stack ruby developer.I am trying to scrape to the data from the website and i am successfully able to get the data.But the problem is that next time when i fetched the data i just want to fetch only new data the i don't want to overwrite all the the data in the database.
I just want to add new record which added recently.But i am not able to find any solution for that how to do it with minimum queries and minimum code.
Here is my code which i am using for scrapping:
client = Mechanize.new
index_page = client.get('https://www.google.com/')
document_page_index = Nokogiri::HTML::Document.parse(index_page.body)
page_no_merchant = document_page_index.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no_merchant) do |page_number|
client.get("https://www.google.com/buy-gift-cards?page=#{page_number}") do |page|
document = Nokogiri::HTML::Document.parse(page.body)
document.css('.product-source').each do |item|
merchant_name= item.children.css('.name').text.gsub("Gift Cards", "")
puts merchant_name
href = item.css('a').first.attr('href')
puts href
image_url=item.children.css('.img img').attr('data-src').text.strip
puts image_url
image_url=URI.parse(image_url)
#merchant=Merchant.create!(name: merchant_name , image_url:image_url)
first_page = client.get("https://www.google.com#{href}")
document_page = Nokogiri::HTML::Document.parse(first_page.body)
page_no = document_page.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no) do |page_number_giftcard|
type1=[]
card_page = client.get("https://www.google.com#{href}?page=#{page_number_giftcard}")
document_page = Nokogiri::HTML::Document.parse(card_page.body)
document_page.xpath('//table/tbody/tr[#class="toggle-details"]').collect do |row|
row.at("td[2] ul").children.each do |typeli|
type = typeli.text.strip if typeli.text.strip.length != 0
type1 << type if typeli.text.strip.length != 0
end
value = row.at('td[3]').text.strip
value = value.to_s.tr('$', '').to_f
puts value
per_discount = row.at('td[4]').text.strip
per_discount = per_discount.to_s.tr('%', '').to_f
puts per_discount
final_price = row.at('td[5] strong').text.strip
final_price = final_price.to_s.tr('$', '').to_f
puts final_price
puts '******************************'
#giftcard=Giftcard.create(card_type:1, card_value:value, per_off:per_discount, card_price: final_price, merchant_id: #merchant.id)
end
#giftcard.update_attribute()
end
end
end
end
Thank you in advance.

Basically you are saving all data, by doing this.
#merchant=Merchant.create!(name: merchant_name , image_url:image_url)
You can try something like find_or_create_by.
#merchant=Merchant.find_or_create_by(name: merchant_name , image_url:image_url)
http://apidock.com/rails/v4.0.2/ActiveRecord/Relation/first_or_create
http://apidock.com/rails/v4.0.2/ActiveRecord/Relation/find_or_create_by

Related

How do I display database calls from Controller into view, to be viewed in HTML

I was handed a project from someone else, it's in Ruby On Rails, which I know VERY LITTLE. Basically, there is an EXPORT button, that the user clicks to send data to a CSV. I am tasked with sending this data to the view to be seen in HTML. (Thinking I could use dataTables). I have tried following examples, such as:
#example = StudentGroup.where(survey_id: #survey.id).order("groupNum")
and then using <%= #example %> in the view just to see the data and I get nothing. (Also extremely new to MySQL). I'll post the method, if ANYONE can help me, I'd very much appreciate it.
def download_results
if (user_signed_in?)
else
redirect_to new_user_session_path
end
#survey = Survey.find(params[:survey_to_view])
filename = #survey.name + " - " + Date.today.to_formatted_s(:short)
require "csv"
CSV.open(#survey.name+".csv", "wb") do |csv|
csv << [filename]
StudentGroup.where(survey_id: #survey.id).order("groupNum")
csv << []
csv << ["Summarized Results"]
csv << ["UCA","Group Number","Criteria 1","Criteria 2","Criteria 3","Criteria 4","Criteria 5","Criteria 6","Criteria 7","Criteria 8","Overall Team Contribution","Average(Would Work With Again)","Average(C1..C8)","Overall Team Contribution MINUS Average(C1..C9)"]
questions = #survey.questions
numQuestions = 0
questions.each do |q|
if(q.question_type != 2 && q.question_type != 4)
numQuestions = numQuestions+1
end
end
groups.each do |g|
answersCount = Answer.where(student_group_id: g.id).count
if(answersCount == numQuestions && answersCount != 0)
othersInGroup = StudentGroup.where(groupNum: g.groupNum, survey_id: #survey.id).order("groupNum")
size = othersInGroup.count-1
arr = []
criteria = SurveyQuestionDatum.where("number > 24 AND number < 35")
multiAvg = 0
teamCont = 0
criteria.each do |c|
avg = 0
othersInGroup.each do |o|
a = Answer.where(survey_question_datum_id: c.id, student_group_id: o.id).first
if(o.uca != g.uca)
if(a.nil?)
size = size-1
else
avg = avg + a.answer[g.uca].to_i
end
end
end
avg = avg.to_f/size
if(c.number == 33)
teamCont = avg
end
if(c.number < 33)
multiAvg = multiAvg+avg
end
arr << avg
end
multiAvg = multiAvg.to_f/8
arr << multiAvg
arr << teamCont-multiAvg
arr.insert(0,g.uca, g.groupNum)
csv << arr
end
end
csv << []
csv << []
csv << ["Raw Student Answers"]
groups = StudentGroup.where(survey_id: #survey.id).order("groupNum")
size = groups.count
csv << ["UCA", "F-Number", "Group Number"]
groups.each do |g|
answersCount = Answer.where(student_group_id: g.id).count
if(answersCount == numQuestions && answersCount != 0)
othersInGroup = StudentGroup.where(groupNum: g.groupNum, survey_id: #survey.id).order("groupNum")
csv << []
csv << [g.uca, g.FNum, g.groupNum]
answers = Answer.where(student_group_id: g.id)
csv << ["Question Number", "Question", "Answer"]
answers.each do |a|
datum = a.survey_question_datum
question = datum.question
#question_types = {"0" => "short", "1" => "paragraph",
#2" => "title", "3" => "fivept", "4" => "fixed",
#5" =>"ranking", "6"=>"tenpoints","7"=>"hundredpoints"}
ansText = ""
if(question.question_type == 0)
ansText = a.answer
elsif (question.question_type == 1)
if(question.rule == 'perMember')
othersInGroup.each do |o|
ansText = ansText+"#{o.uca},#{a.answer[o.uca]},"
end
elsif(question.rule == 'default')
ansText = a.answer
end
else (question.question_type == 3)
othersInGroup.each do |o|
ansText = ansText+"#{o.uca},#{a.answer[o.uca]},"
end
end
ansText = ansText.chomp(',')
ansText = ansText.split(',')
ansText.insert(0,datum.number,question.question_text)
csv << ansText
end
end
end
end
send_file(#survey.name+".csv", :filename => filename+".csv")
end
You need a new controller action. Take a look at http://guides.rubyonrails.org/layouts_and_rendering.html
Create an index (or show, or whatever you want to call it, maybe example) action. Make sure it is in your routes.
http://guides.rubyonrails.org/getting_started.html#adding-a-route-for-comments
do not use the download_results code.
set your #example variable the way you were trying to do.
create a view for your index action
add the data to your index view.
If you put code in your download_results method (action) it will never get rendered because of the send_file method call.
Did you create a brand new controller / action / view? Did you use generators? Have you really practiced doing this setup exactly the way the examples, videos, tutorials say to do it? If you have, you have seen how all the pieces (models, controllers, actions, views) come together. You should have seen how render statements come into play. Do that, exactly as the tutorials say to do it and you will get the idea.
If you want to use the same content that the download action uses, refactor the code to extract a method that is used both actions.
This is related to respond_to part, check the docs.
send_file(#survey.name+".csv", :filename => filename+".csv")
Your code above simply means you click the button, the controller will respond you with a csv file. So, if you want a html, the controller should be able to respond to html as well.

How to put validation on form in which button is not used in Ruby On Rails?

I have three fields start date , end date and , issue tracker group if I select only group then all records of that group should come from issue request table but when I select date then it should display specific date records .
Controller Code -
def group_report_list
#start = params[:date].to_date
#en = params[:to_date].to_date
#issue_tracker_group = IssueTrackerGroup.find(params[:id])
#issue_requests = IssueRequest.where(issue_tracker_group_id: #issue_tracker_group.id,date: #start..#en)
one way is using normal if..else condition.
def group_report_list
#start = params[:date].to_date unless params[:date].nil?
#en = params[:to_date].to_date unless params[:to_date].nil?
#issue_tracker_group = IssueTrackerGroup.find(params[:id])
unless #start.nil? or #en.nil?
#issue_requests = IssueRequest.where(issue_tracker_group_id: #issue_tracker_group.id,date: #start..#en)
else
#issue_requests = IssueRequest.where(issue_tracker_group_id: #issue_tracker_group.id)
end
end
if you feel this is too lengthy code, you can use the code below
def group_report_list
#start = params[:date]
#en = params[:to_date]
#issue_tracker_group = IssueTrackerGroup.find(params[:id])
condition = "issue_tracker_group_id = #{#issue_tracker_group.id}"
condition += " AND date BETWEEN '#{#start.to_date}' AND '#{#en.to_date}' " unless #start.nil? or #en.nil?
#issue_requests = IssueRequest.where(condition)
end

Scraping data in rails using thread

I am doing scraping to fetch the data from the website to my database in rails.I am fetching the 32000 record with this script there isn't any issue but i want to fetch the data faster so i apply the thread in my rake task but then there is a issue while running the rake task some of the data is fetching then the rake task getting aborted.
I am not aware of what to do task if any help can be done i am really grateful . Here is my rake task code for the scraping.
task scratch_to_database: :environment do
time2 = Time.now
puts "Current Time : " + time2.inspect
client = Mechanize.new
giftcard_types=Giftcard.card_types
find_all_merchant=Merchant.all.pluck(:id, :name).to_h
#first index page of the merchant
index_page = client.get('https://www.twitter.com//')
document_page_index = Nokogiri::HTML::Document.parse(index_page.body)
#set all merchant is deteled true
# set_merchant_as_deleted = Merchant.update_all(is_deleted: true) if Merchant.exists?
# set_giftcard_as_deleted = Giftcard.update_all(is_deleted: true) if Giftcard.exists?
update_all_merchant_record = []
update_all_giftcard_record = []
threads = []
#Merchant inner page pagination loop
page_no_merchant = document_page_index.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
1.upto(page_no_merchant) do |page_number|
threads << Thread.new do
client.get("https://www.twitter.com/buy-gift-cards?page=#{page_number}") do |page|
document = Nokogiri::HTML::Document.parse(page.body)
#Generate the name of the merchant and image of the merchant loop
document.css('.product-source').each do |item|
merchant_name= item.children.css('.name').text.gsub("Gift Cards", "")
href = item.css('a').first.attr('href')
image_url=item.children.css('.img img').attr('data-src').text.strip
#image url to parse the url of the image
image_url=URI.parse(image_url)
#saving the record of the merchant
# #merchant=Merchant.create(name: merchant_name , image_url:image_url)
if find_all_merchant.has_value?(merchant_name)
puts "this if"
merchant_id=find_all_merchant.key(merchant_name)
puts merchant_id
else
#merchant= Merchant.create(name: merchant_name , image_url:image_url)
update_all_merchant_record << #merchant.id
merchant_id=#merchant.id
end
# #merchant.update_attribute(:is_deleted, false)
#set all giftcard is deteled true
# set_giftcard_as_deleted = Giftcard.where(merchant_id: #merchant.id).update_all(is_deleted: true) if Giftcard.where(merchant_id: #merchant.id).exists?
#first page of the giftcard details page
first_page = client.get("https://www.twitter.com#{href}")
document_page = Nokogiri::HTML::Document.parse(first_page.body)
page_no = document_page.css('.pagination.pagination-centered ul li:nth-last-child(2) a').text.to_i
hrefextra =document_page.css('.dropdown-menu li a').last.attr('href')
#generate the giftcard details loop with the pagination
# update_all_record = []
find_all_giftcard=Giftcard.where(merchant_id:merchant_id).pluck(:row_id)
puts merchant_name
# puts find_all_giftcard.inspect
card_page = client.get("https://www.twitter.com#{hrefextra}")
document_page = Nokogiri::HTML::Document.parse(card_page.body)
#table details to generate the details of the giftcard with price ,per_off and final value of the giftcard
document_page.xpath('//table/tbody/tr[#class="toggle-details"]').collect do |row|
type1=[]
row_id = row.attr("id").to_i
row.at("td[2] ul").children.each do |typeli|
type = typeli.text.strip if typeli.text.strip.length != 0
type1 << type if typeli.text.strip.length != 0
end
value = row.at('td[3]').text.strip
value = value.to_s.tr('$', '').to_f
per_discount = row.at('td[4]').text.strip
per_discount = per_discount.to_s.tr('%', '').to_f
final_price = row.at('td[5] strong').text.strip
final_price = final_price.to_s.tr('$', '').to_f
type1.each do |type|
if find_all_giftcard.include?(row_id)
update_all_giftcard_record<<row_id
puts "exists"
else
puts "new"
#giftcard= Giftcard.create(card_type: giftcard_types.values_at(type.to_sym)[0], card_value:value, per_off:per_discount, card_price: final_price, merchant_id: merchant_id , row_id: row_id )
update_all_giftcard_record << #giftcard.row_id
end
end
#saving the record of the giftcard
# #giftcard=Giftcard.create(card_type:1, card_value:value, per_off:per_discount, card_price: final_price, merchant_id: #merchant.id , gift_card_type: type1)
end
# Giftcard.where(:id =>update_all_record).update_all(:is_deleted => false)
#delete all giftcard which is not present
# giftcard_deleted = Giftcard.where(:is_deleted => true,:merchant_id => #merchant.id).destroy_all if Giftcard.where(merchant_id: #merchant.id).exists?
time2 = Time.now
puts "Current Time : " + time2.inspect
end
end
end
end
threads.each(&:join)
puts "-------"
puts threads
# merchant_deleted = Merchant.where(:is_deleted => true).destroy_all if Merchant.exists?
merchant_deleted = Merchant.where('id NOT IN (?)',update_all_merchant_record).destroy_all if Merchant.exists?
giftcard_deleted = Giftcard.where('row_id NOT IN (?)',update_all_giftcard_record).destroy_all if Giftcard.exists?
end
end
Error i am receiving:
ActiveRecord::ConnectionTimeoutError: could not obtain a connection from the pool within 5.000 seconds (waited 5.001 seconds); all pooled connections were in use
Each thread requires a separate connection to your database. You need to increase the connection pool size that your application can use in your database.yml file.
But your database should also be capable of handling the incoming connections. If you are using mysql you can check this by running select ##MAX_CONNECTIONS on your console.

Cant found model with out an ID in rails 3.2.12

i ve this method. I m not at all able to understand the error which is
Couldn't find Company without an ID
in ActiveRecord::RecordNotFound in CustomersController#bulk_create
This method is written to create customers for a company in bulk by taking their name and numbers in format name:number.
The method is as follows:
def bulk_create
res = ""
comp_id = params[:customer][:selected_companies].delete_if{|a| a.blank?}.first
comp = Company.find(comp_id)
s = SentSmsMessage.new
s.set_defaults
s.data = tmpl("command_signup_ok", customer, comp) unless params[:customer][:email].length > 0
s.data = params[:customer][:email] if params[:customer][:email].length > 0
s.company = comp if !comp.nil?
s.save
unless comp_id.blank?
params[:customer][:name].lines.each do |line|
(name, phone) = line.split(/\t/) unless line.include?(":")
(name, phone) = line.split(":") if line.include?(":")
phone = phone.gsub("\"", "")
phone = phone.strip if phone.strip.to_i > 0
name = name.gsub("\"", "")
name = name.gsub("+", "")
phone = "47#{phone}" if params[:customer][:active].to_i == 1
customer = Customer.first(:conditions => ["phone_number = ?", phone])
if customer.nil?
customer = Customer.new
customer.name = name
# customer.email
# customer.login
# customer.password
customer.accepted_agreement = DateTime.now
customer.phone_number = phone
customer.active = true
customer.accepted_agreement = DateTime.now
customer.max_msg_week = params[:customer][:max_msg_week]
customer.max_msg_day = params[:customer][:max_msg_day]
customer.selected_companies = params[:customer][:selected_companies].delete_if{|a| a.blank?}
res += "#{name} - #{phone}: Create OK<br />" if customer.save
res += "#{name} - #{phone}: Create failed<br />" unless customer.save
else
params[:customer][:selected_companies].each do |cid|
new_company = Company.find(cid) unless cid.blank?
if !new_company.nil?
if !customer.companies.include?(new_company)
customer.companies << new_company
if customer.save
res += "#{name} - #{phone}: Customer exists and the customer was added to the firm #{new_company.name}<br />"
else
res += "#{name} - #{phone}: Customer exist, but something went wrong during storage. Check if the client is in the firm.<br />"
end
else
res += "#{name} - #{phone}: Customer exists and is already on firm #{new_company.name}<br />"
end
end
end
end
s.sms_recipients.create(:phone_number => customer.phone_number)
end
s.save
s.send_as_sms
#result = res
respond_to do |format|
format.html { render "bulk_create"}
end
else
#result = "You have not selected any firm to add these users. Press the back button and try again."
respond_to do |format|
format.html { render "bulk_create"}
end
end
end
I want to update one situation here. That when i submit the form blank then it gives this error. Also if i filled the form with the values then its show the situation which the method is returning in case of fail.
res += "#{name} - #{phone}: Create failed <br />"
The tmpl method
private
def tmpl(setting_name, customer, company = nil)
text = ""
if customer.companies.count > 0
sn = "#{setting_name}_#{#customer.companies.first.company_category.suffix}".downcase rescue setting_name
text = Setting.value_by(sn) rescue ""
end
textlenth = text.length rescue 0
if textlenth < 3
text = Setting.value_by(setting_name) rescue Setting.value_by("command_error")
end
return fill_template(text, customer, company)
end
From the model customer.rb
def selected_companies=(cmps)
cmps.delete("")
# Check the old ones. Make a note if they are not in the list. If the existing ones are not in the new list, just remove them
self.companies.each do |c|
self.offer_subscriptions.find(:first, ["customer_id = ?", c]).destroy unless cmps.include? c.id.to_s
cmps.delete c.id.to_s if cmps.include? c.id.to_s
end
# Then create the new ones
cmps.each do |c2|
cmp = Company.find(:first, ["id = ?", c2])
if cmp && !c2.blank?
offerSubs = offer_subscriptions.new
offerSubs.company_id = c2
offerSubs.save
end
end
end
def selected_companies
return self.companies.collect{|c| c.id}
end
The association of customer is as follows:
has_many :offer_subscriptions
has_many :companies, :through => :offer_subscriptions
This code is written by the some one else. I m trying to understand this method but so far not being able to understand this code.
Please help.
Thanks in advance.
You are getting 'Couldn't find Company without an ID' error because your Company table doesn't contain record with id = comp_id
Change comp = Company.find(comp_id) to comp = Company.find_by_id(comp_id).
This will return nil instead of an error.
Add comp is not nil condition is already handled in your code.
Your comp_id line is returning nil.
comp_id = params[:customer][:selected_companies].delete_if{|a| a.blank?}.first
Post the params that get passed to this function and we could hopefully find out why. In the meantime you could enclose the block in a begin - rescue block to catch these errors:
begin
<all your code>
rescue ActiveRecord::RecordNotFound
return 'Unable to find a matching record'
end
try this:
comp = ""
comp = Company.find(comp_id) unless comp_id.nil?
instead of comp = Company.find(comp_id)
further nil checking present in your code.
Reason being
params[:customer][:selected_companies].delete_if{|a| a.blank?} = []
so [].first = nil
therefor, params[:customer][:selected_companies].delete_if{|a| a.blank?}.first = nil
and comp_id is nil
So check the log file and check what is coming in the parameter "selected_companies"
when you will find the parameter, everything will be understood well....

Gem Resque Error - Undefined "method perform" after Overriding it form the super class

First of all Thanks for you all for helping programmers like me with your valuable inputs in solving day to day issues.
This is my first question in stack overflow as I am experiencing this problems from almost one week.
WE are building a crawler which crawls the specific websites and extract the contents from it, we are using mechanize to acheive this , as it was taking loads of time we decided to run the crawling process as a background task using resque with redis gem , but while sending the process to background I am experiencing the error as the title saying,
my code in lib/parsers/home.rb
require 'resque'
require File.dirname(__FILE__)+"/../index"
class Home < Index
Resque.enqueue(Index , :page )
def self.perform(page)
super (page)
search_form = page.form_with :name=>"frmAgent"
resuts_page = search_form.submit
total_entries = resuts_page.parser.xpath('//*[#id="PagingTable"]/tr[2]/td[2]').text
if total_entries =~ /(\d+)\s*$/
total_entries = $1
else
total_entries = "unknown"
end
start_res_idx = 1
while true
puts "Found #{total_entries} entries"
detail_links = resuts_page.parser.xpath('//*[#id="MainTable"]/tr/td/a')
detail_links.each do |d_link|
if d_link.attribute("class")
next
else
data_page = #agent.get d_link.attribute("href")
fields = get_fields_from_page data_page
save_result_page page.uri.to_s, fields
#break
end
end
site_done
rescue Exception => e
puts "error: #{e}"
end
end
and the superclass in lib/index.rb is
require 'resque'
require 'mechanize'
require 'mechanize/form'
class Index
#queue = :Index_queue
def initialize(site)
#site = site
#agent = Mechanize.new
#agent.user_agent = Mechanize::AGENT_ALIASES['Windows Mozilla']
#agent.follow_meta_refresh = true
#rows_parsed = 0
#rows_total = 0
rescue Exception => e
log "Unable to login: #{e.message}"
end
def run
log "Parsing..."
url = "unknown"
if #site.url
url = #site.url
log "Opening #{url} as a data page"
#page = #agent.get(url)
#perform method should be override in subclasses
#data = self.perform(#page)
else
#some sites do not have "datapage" URL
#for example after login you're already on your very own datapage
#this is to be addressed in 'perform' method of subclass
#data = self.perform(nil)
end
rescue Exception=>e
puts "Failed to parse URL '#{url}', exception=>"+e.message
set_site_status("error "+e.message)
end
#overriding method
def self.perform(page)
end
def save_result_page(url, result_params)
result = Result.find_by_sql(["select * from results where site_id = ? AND ref_code = ?", #site.id, utf8(result_params[:ref_code])]).first
if result.nil?
result_params[:site_id] = #site.id
result_params[:time_crawled] = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result_params[:link] = url
result = Result.create result_params
else
result.result_fields.each do |f|
f.delete
end
result.link = url
result.time_crawled = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result.html = result_params[:html]
fields = []
result_params[:result_fields_attributes].each do |f|
fields.push ResultField.new(f)
end
result.result_fields = fields
result.save
end
#rows_parsed +=1
msg = "Saved #{#rows_parsed}"
msg +=" of #{#rows_total}" if #rows_total.to_i > 0
log msg
return result
end
end
What's Wrong with this code?
Thanks

Resources