Im trying to use Kimurai to scrape a website. Im running into this error when I want to do /scrape.
def scrape
url = "https://www.tripadvisor.com/Restaurants-g31892-Rogers_Arkansas.html"
response = RestaurantsScraper.parse!(response, url, data: {})
if response[status] == :completed && response[error].nil?
flash.now[notice] = "Successfully scraped url"
else
flash.now[alert] = response[error]
end
end
Here is my scraper class
class RestaurantsScraper < Kimurai::Base
#name = "restaurants_scraper"
#driver = :selenium_chrome
#start_urls = ["https://www.tripadvisor.com/Restaurants-g31892-Rogers_Arkansas.html"]
def parse(response, url:, data: {})
response.xpath("//div[#class=_1llCuDZj]").each do |a|
request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
end
end
def parse_repo_page(response, url:, data: {})
item = {}
item["title"] = t.css('a._15_ydu6b')&.text&.squish&.gsub('[^0-9].', '')
item["type"] = t.css('span._1p0FLy4t')&.text&.squish
item["reviews"] = t.css('span.w726Ki5B').text&.squish
item["top_reviews"] = t.css('a._2uEVo25r _3mPt7dFq').text&.squish
Restaurant.where(item).first_or_create
end
end
Here is the error im getting
It's because response from RestaurantsScraper.parse!(response, url, data: {}) isn't defined.
From the kimurai docs it says you need to pass a Nokogiri::HTML::Document object.
I haven't used Kimurai and it feels like there is definitely a better way to do this, but something like the following may be enough to get you to the next step:
def scrape
require 'open-uri'
url = "https://www.tripadvisor.com/Restaurants-g31892-Rogers_Arkansas.html"
html = Nokogiri.parse open(url)
response = RestaurantsScraper.parse!(html, url, data: {})
if response[status] == :completed && response[error].nil?
flash.now[notice] = "Successfully scraped url"
else
flash.now[alert] = response[error]
end
end
Related
I have a rails worker using redis/sidekiq where I send some data to an API (Active Campaign), so I normally use all the http configurations to send data. I want to have it nice and clean, so it's part of a refactor thing. My worker currently looks like this:
class UpdateLeadIdWorker
include Sidekiq::Worker
BASE_URL = Rails.application.credentials.dig(:active_campaign, :url)
private_constant :BASE_URL
API_KEY = Rails.application.credentials.dig(:active_campaign, :key)
private_constant :API_KEY
def perform(ac_id, current_user_id)
lead = Lead.where(user_id: current_user_id).last
url = URI("#{BASE_URL}/api/3/contacts/#{ac_id}") #<--- need this endpoint
https = bindable_lead_client.assign(url)
pr = post_request.assign(url)
case lead.quote_type
when 'renter'
data = { contact: { fieldValues: [{ field: '5', value: lead.lead_id }] } }
when 'home'
data = { contact: { fieldValues: [{ field: '4', value: lead.lead_id }] } }
when 'auto'
data = { contact: { fieldValues: [{ field: '3', value: lead.lead_id }] } }
else
raise 'Invalid quote type'
end
pr.body = JSON.dump(data)
response = JSON.parse(https.request(pr).read_body).symbolize_keys
if response.code == '200'
Rails.logger.info "Successfully updated contact #{ac_id} with lead id #{lead.lead_id}"
else
raise "Error creating contact: #{response.body}"
end
end
def bindable_lead_client
http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
http
end
def post_request
post_request_ = Net::HTTP::Put.new(url)
post_request_['Accept'] = 'application/json'
post_request_['Content-Type'] = 'application/json'
post_request_['api-token'] = API_KEY
post_request_
end
end
But whenever I run this I get:
2022-07-28T00:52:08.683Z pid=24178 tid=1s1u WARN: NameError: undefined local variable or method `url' for #<UpdateLeadIdWorker:0x00007fc713442be0 #jid="e2b9ddb6d5f4b8aecffa4d8b">
Did you mean? URI
I don't want everything stuck in one method. How could I achieve to make this cleaner?
Thanks.
Pure ruby wise, The reason you get the error is because your method definition bindable_lead_client is missing the url argument. Hence undefined variable.
So def should look something like:
def bindable_lead_client (url)
http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
http
end
and call:
bindable_lead_client(url)
As for how to make this code better, falls under question being too subjective under StackOverflow guidelines, which encourage you to ask more specific questions.
So I am writing a simple controller that will receive parameters from a Postrequest to my API. And I want to keep things cleaner and nice, so I wrote something like this:
def create
update_contact
end
def update_contact
create_token
url = URI("https://acme.api-us1.com/api/3/contacts/#{active_campaign_id}")
http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
request = Net::HTTP::Put.new(url)
request['Accept'] = 'application/json'
request['Content-Type'] = 'application/json'
request['api-token'] = API_KEY
data = { contact: { fieldValues: [{ field: '1', value: contact[:email_token] }] } }
request.body = JSON.dump(data)
response = http.request(request)
end
def create_token
active_campaign_id = params[:contact][:id].to_i
generate_token = SecureRandom.urlsafe_base64(12)
contact = Contact.find_or_initialize_by(active_campaign_id: active_campaign_id, email_token: generate_token)
contact.save!
end
But whenever I run this turns into:
*** NameError Exception: undefined local variable or method `active_campaign_id'
and same goes for email_token
*** NameError Exception: undefined local variable or method `email_token'
Now whenever I do this:
def create
update_contact
end
def update_contact
active_campaign_id = params[:contact][:id].to_i
generate_token = SecureRandom.urlsafe_base64(12)
contact = Contact.find_or_initialize_by(active_campaign_id: active_campaign_id, email_token: generate_token)
contact.save!
url = URI("https://acme.api-us1.com/api/3/contacts/#{active_campaign_id}")
http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
request = Net::HTTP::Put.new(url)
request['Accept'] = 'application/json'
request['Content-Type'] = 'application/json'
request['api-token'] = API_KEY
data = { contact: { fieldValues: [{ field: '1', value: contact[:email_token] }] } }
request.body = JSON.dump(data)
response = http.request(request)
end
It works! Why is that? How can I structure my code or methods as clean as possible?
And what resources could make me understand more accessing methods in rails?
Thanks for the help!
If you want to keep methods you have and make it works you can achieve this by doing next refactoring:
def create
update_contact
end
def update_contact
contact = create_contact
url = URI("https://acme.api-us1.com/api/3/contacts/#{contact.active_campaign_id}")
http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true
request = Net::HTTP::Put.new(url)
request['Accept'] = 'application/json'
request['Content-Type'] = 'application/json'
request['api-token'] = API_KEY
data = { contact: { fieldValues: [{ field: '1', value: contact.email_token }] } }
request.body = JSON.dump(data)
response = http.request(request)
end
def create_contact
Contact.create_with(
email_token: SecureRandom.urlsafe_base64(12)
).find_or_create_by!(
active_campaign_id: params.dig(:contact, :id)&.to_i
)
end
And probably you need to use create_with method because every time when you will try to find Contact by fields pair email_token and active_campaign_id SecureRandom.urlsafe_base64(12) will generate a new email token and you always will have new object created instead of getting it from database.
It looks like you are trying to access the 'active_campaign_id' and 'email_token' variables in the 'update_contact' method, but those variables are only defined in the 'create_token' method. Try moving the 'update_contact' method inside the 'create_token' method so that it has access to those variables.
More info you could find here:
https://guides.rubyonrails.org/v6.1/action_view_overview.html#using-action-view-with-rails
https://guides.rubyonrails.org/v6.1/engines.html#using-a-controller-provided-by-the-application
https://guides.rubyonrails.org/v6.1/security.html#user-management
https://guides.rubyonrails.org/v6.1/2_3_release_notes.html#action-controller
https://guides.rubyonrails.org/v6.1/active_record_multiple_databases.html#automatic-swapping-for-horizontal-sharding
I want to redirect another page or give alert messages due to user table info in this controller.
but, I get bellow error message.
like this:
class Ir::FactsetUrlsController < Ir::ApplicationController
def show
user = User.find(params[:user_id])
factset_url, option = get_factset_url(user)
redirect_to factset_url, option
end
private
def get_factset_url(investor)
url, option = if !current_user.factset_enabled?
url, {alert: "こちらの閲覧には有料契約が必要です。"}
elsif investor.company.is_fresh?
url, {alert: "現在収録作業中です。申し訳ございませんが、少々お待ち下さい。"}
elsif investor.company.is_fresh?
url, {alert: "申し訳ございませんが、投資家DBに収録がありません 収録されていない理由 ①投資助言会社や投資アドバイザーなど直接保有していない ②HPや開示されている情報がない"}
else
investor.company.factset_url
end
end
end
Simply wrap in square brackets [ and ] within conditions as multiple values are being assigned.
def get_factset_url(investor)
url, option = if !current_user.factset_enabled?
[url, {alert: "こちらの閲覧には有料契約が必要です。"}]
elsif investor.company.is_fresh?
[url, {alert: "現在収録作業中です。申し訳ございませんが、少々お待ち下さい。"}]
elsif investor.company.is_fresh?
[url, {alert: "申し訳ございませんが、投資家DBに収録がありません 収録されていない理由 ①投資助言会社や投資アドバイザーなど直接保有していない ②HPや開示されている情報がない"}]
else
[investor.company.factset_url, nil]
end
end
I'm new to Rails and I'm trying to make a simple weather API to get weather by zipcode
is there a way to get the zipcode from user input from a simple form, this will be just for learning so I'm not trying to make users devise, or users model
require 'net/http'
require 'json'
#url = 'http://api.openweathermap.org/data/2.5/weather?zip=#{zipcode}&appid=APIKEY'
#uri = URI(#url)
#response = Net::HTTP.get(#uri)
#output = JSON.parse(#response)
actually I figured it out, i needed to add
def zipcode
#zip_query = params[:zipcode]
if params[:zipcode] == ""
#zip_query = "Hey you forgot to enter a zipcode!"
elsif params[:zipcode]
# Do Api stuff
require 'net/http'
require 'json'
#url = 'http://api.openweathermap.org/data/2.5/weather?zip='+ #zip_query +'&appid=APIKEY'
#uri = URI(#url)
#response = Net::HTTP.get(#uri)
#output = JSON.parse(#response)
#name = #output['name']
# Check for empty return result
if #output.empty?
#final_output = "Error"
elsif !#output
#final_output = "Error"
else
#final_output = ((#output['main']['temp'] - 273.15) * 9/5 +32).round(2)
end
end
end
in the controller.rb file
and add
post "zipcode" => 'home#zipcode'
get "home/zipcode"
in the routes file
but I'm sure this is not the best practice
First of all Thanks for you all for helping programmers like me with your valuable inputs in solving day to day issues.
This is my first question in stack overflow as I am experiencing this problems from almost one week.
WE are building a crawler which crawls the specific websites and extract the contents from it, we are using mechanize to acheive this , as it was taking loads of time we decided to run the crawling process as a background task using resque with redis gem , but while sending the process to background I am experiencing the error as the title saying,
my code in lib/parsers/home.rb
require 'resque'
require File.dirname(__FILE__)+"/../index"
class Home < Index
Resque.enqueue(Index , :page )
def self.perform(page)
super (page)
search_form = page.form_with :name=>"frmAgent"
resuts_page = search_form.submit
total_entries = resuts_page.parser.xpath('//*[#id="PagingTable"]/tr[2]/td[2]').text
if total_entries =~ /(\d+)\s*$/
total_entries = $1
else
total_entries = "unknown"
end
start_res_idx = 1
while true
puts "Found #{total_entries} entries"
detail_links = resuts_page.parser.xpath('//*[#id="MainTable"]/tr/td/a')
detail_links.each do |d_link|
if d_link.attribute("class")
next
else
data_page = #agent.get d_link.attribute("href")
fields = get_fields_from_page data_page
save_result_page page.uri.to_s, fields
#break
end
end
site_done
rescue Exception => e
puts "error: #{e}"
end
end
and the superclass in lib/index.rb is
require 'resque'
require 'mechanize'
require 'mechanize/form'
class Index
#queue = :Index_queue
def initialize(site)
#site = site
#agent = Mechanize.new
#agent.user_agent = Mechanize::AGENT_ALIASES['Windows Mozilla']
#agent.follow_meta_refresh = true
#rows_parsed = 0
#rows_total = 0
rescue Exception => e
log "Unable to login: #{e.message}"
end
def run
log "Parsing..."
url = "unknown"
if #site.url
url = #site.url
log "Opening #{url} as a data page"
#page = #agent.get(url)
#perform method should be override in subclasses
#data = self.perform(#page)
else
#some sites do not have "datapage" URL
#for example after login you're already on your very own datapage
#this is to be addressed in 'perform' method of subclass
#data = self.perform(nil)
end
rescue Exception=>e
puts "Failed to parse URL '#{url}', exception=>"+e.message
set_site_status("error "+e.message)
end
#overriding method
def self.perform(page)
end
def save_result_page(url, result_params)
result = Result.find_by_sql(["select * from results where site_id = ? AND ref_code = ?", #site.id, utf8(result_params[:ref_code])]).first
if result.nil?
result_params[:site_id] = #site.id
result_params[:time_crawled] = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result_params[:link] = url
result = Result.create result_params
else
result.result_fields.each do |f|
f.delete
end
result.link = url
result.time_crawled = DateTime.now().strftime "%Y-%m-%d %H:%M:%S"
result.html = result_params[:html]
fields = []
result_params[:result_fields_attributes].each do |f|
fields.push ResultField.new(f)
end
result.result_fields = fields
result.save
end
#rows_parsed +=1
msg = "Saved #{#rows_parsed}"
msg +=" of #{#rows_total}" if #rows_total.to_i > 0
log msg
return result
end
end
What's Wrong with this code?
Thanks