I have written the following Ruby script:
require 'open-uri'
require 'nokogiri'
require 'anemone'
class JobFox
  attr_accessor :company_url,
                :jobs_page,
                :max_words,
                :jobs_part,
                :jobs_container,
                :element_score,
                :max_score,
                :jobs

  def calc_element_score(element)
    self.element_score += (element['class'].to_s.scan(/job|career|position|opening/).count +
                           element['id'].to_s.scan(/job|career|position|opening/).count) * 100
    self.element_score += element.to_s.scan(/job|career|position|opening/).count * 5
    element.css('a').each do |a|
      self.element_score += a.to_s.scan(/job|career|position|opening/).count * 7
    end
    element.css('li').each do |li|
      self.element_score += li.to_s.scan(/job|career|position|opening/).count * 5
    end
    element.css('h').each do |h|
      self.element_score += h.to_s.scan(/job|career|position|opening/).count * 3
    end
    if self.element_score > self.max_score
      self.max_score = self.element_score
      self.jobs_part = element
    end
    if element.children.count == 0
      self.element_score = 0
    end
  end
end
fox = JobFox.new
fox.company_url = 'http://www.website.com'
fox.max_words = 0
fox.jobs = []

# CRAWL THE WEBSITE TO FIND THE JOBS LINK
Anemone.crawl(fox.company_url, :depth_limit => 3) do |anemone|
  anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
    begin
      puts "SCANNING: " + page.url.to_s
      # SCAN THE HTML AND FIND THE OCCURRENCES OF THE WORD "JOB"
      source_html = open(page.url).read
      job_occurrences = source_html.scan(/job|jobs|work|position/).count
      # IF MORE OCCURRENCES THAN BEFORE, WE KEEP THE PAGE URL
      if job_occurrences > fox.max_words
        fox.max_words = job_occurrences
        fox.jobs_page = page.url
      end
    rescue Exception => e
      puts e
    end
  end
end

fox.jobs_container = Nokogiri::HTML(open(fox.jobs_page))
fox.element_score = fox.max_score = 0
fox.jobs_container.css('div, section').each do |container|
  container.traverse do |element|
    fox.calc_element_score(element)
  end
end

fox.jobs_part.traverse do |element|
  element.css('a').each do |job|
    fox.jobs << job.text
  end
end

# REMOVE POSSIBLE DUPLICATE ENTRIES
fox.jobs = fox.jobs.uniq
puts fox.jobs
and I am trying to port it to a Rails application, not as a script or task but as a model method:
require 'anemone'
require 'open-uri'
require 'nokogiri'
class Company < ActiveRecord::Base
  has_many :jobs
  accepts_nested_attributes_for :jobs

  # CALCULATE THE RELATEDNESS OF EACH HTML ELEMENT
  def calculate_element_score(element)
    @jobs_expression = '/job|career|position|opening/'
    @element_score += (element['class'].to_s.scan(@jobs_expression).count + element['id'].to_s.scan(@jobs_expression).count) * 100
    @element_score += element.to_s.scan(@jobs_expression).count * 5
    element.css('a').each do |a|
      @element_score += a.to_s.scan(@jobs_expression).count * 7
    end
    element.css('li').each do |li|
      @element_score += li.to_s.scan(@jobs_expression).count * 5
    end
    element.css('h').each do |h|
      @element_score += h.to_s.scan(@jobs_expression).count * 3
    end
    if @element_score > @max_score
      @max_score = @element_score
      @jobs_part = element
    end
    if element.children.count == 0
      @element_score = 0
    end
  end

  # CRAWL THE WEBSITE TO FIND THE JOBS PAGE
  def find_jobs_page
    max_words = 0
    Anemone.crawl(self.website, :depth_limit => 3) do |anemone|
      anemone.on_pages_like(/job|jobs|career|careers|team|about/) do |page|
        begin
          # SCAN THE HTML AND FIND OCCURRENCES OF RELEVANT WORDS
          source_html = open(page.url).read
          job_occurrences = source_html.scan(/job|jobs|work|position/).count
          # IF MORE OCCURRENCES THAN BEFORE, KEEP THE PAGE URL
          if job_occurrences > max_words
            max_words = job_occurrences
            self.jobs_page = page.url
          end
        rescue Exception => e
          puts e
        end
      end
    end
  end

  # FIND THE CONTAINER THAT HAS THE JOB LISTINGS
  def find_jobs_container
    jobs_container = Nokogiri::HTML(open(self.jobs_page))
    @element_score = @max_score = 0
    @jobs_expression = '/job|career|position|opening/'
    jobs_container.css('div, section').each do |container|
      container.traverse do |element|
        self.calculate_element_score(element)
      end
    end
  end

  # ADD THE JOBS FROM THE PAGE TO THE COMPANY ASSOCIATION
  def extract_jobs
    @jobs_part.traverse do |element|
      element.css('a').each do |job|
        j = Job.new
        j.title = job.text
        j.url = job
        self.jobs << j
      end
    end
  end

  # THE METHOD TO FIND ALL THE JOBS FOR A COMPANY
  def find_jobs
    self.find_jobs_page
    self.find_jobs_container
    self.extract_jobs
  end
end
Everything works just fine apart from the calculate_element_score method: @element_score is always 0. Have I understood something entirely wrong regarding global variables?
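For what it's worth, one likely culprit (an assumption from reading the posted code, not a confirmed diagnosis): @element_score is an instance variable, not a global ($element_score would be global), and the bigger difference from the script is that @jobs_expression is a String, not a Regexp, because of the quotes around the slashes. String#scan with a String argument searches for that literal text, so every scan returns 0 matches, while the original script used Regexp literals inline. A minimal sketch of the difference:

jobs_expression = /job|career|position|opening/            # Regexp literal, no quotes
"job openings".scan(jobs_expression).count                 #=> 2
"job openings".scan('/job|career|position|opening/').count #=> 0 (searches for that literal text)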
I have strings formatted like this: cookie,sandwich(hotdog,burger),cake(chocolate(tiramisu)),candy. I'd like to convert them into a tree-like structure (can be hash/array):
cookie
sandwich
|__hotdog
|__burger
cake
|__chocolate
|__tiramisu
candy
What's the simplest way to do this? I looked at Treetop but it seems overkill.
str = "cookie,sandwich(hotdog,burger(cheese,onions)),cake(chocolate(tiramisu)),candy"
Let's first create a helper method to split a string on those commas that are not nested inside parentheses.
def separate(str)
  start_idx = 0
  left_paren_count = 0
  str.each_char.with_index.with_object([]) do |(c,i),a|
    case c
    when '('
      left_paren_count += 1
    when ')'
      left_paren_count -= 1
    when ','
      if left_paren_count.zero?
        a << str[start_idx..i-1]
        start_idx = i+1
      end
    end
  end << str[start_idx..-1]
end
For example,
separate(str)
#=> ["cookie",
# "sandwich(hotdog,burger(cheese,onions))",
# "cake(chocolate(tiramisu))",
# "candy"]
separate("hotdog,burger(cheese,onions)")
#=> ["hotdog",
# "burger(cheese,onions)"]
separate("cheese,onions")
#=> ["cheese", "onions"]
We may now write a recursive expression.
def recurse(str)
  separate(str).map do |s|
    s1, s2 = s.split('(', 2)
    s.include?('(') ? [s1, recurse(s2[0..-2])] : s1
  end
end
Try it.
recurse(str)
#=> ["cookie",
# ["sandwich", ["hotdog", ["burger", ["cheese", "onions"]]]],
# ["cake", [["chocolate", ["tiramisu"]]]], "candy"]
class Node
  attr_reader :name
  attr_reader :children

  def initialize(str)
    @name, children_str = str.match(/^(\w+)\((.+)\)$/)&.captures
    nodes = (children_str || str)
      .scan(/(\w+\([\w,]+\))|(\w+\([\w,()]+\))|(\w+)/)
      .flatten.compact
    if @name.nil? && nodes.size == 1
      @name = nodes.first
    else
      @children = nodes.map { |s| Node.new(s) }
    end
  end

  def show_tree(level=0)
    str = ""
    unless @name.nil?
      str = " " * (level - 1) if level > 1
      str += "|__" if level > 0
      str += "#{@name}\n"
      level += 1
    end
    @children&.each do |node|
      str += node.show_tree(level)
    end
    str
  end
end
Test it:
foods = Node.new("cookie,sandwich(hotdog,burger),cake(chocolate(tiramisu)),candy")
puts foods.show_tree
# cookie
# sandwich
# |__hotdog
# |__burger
# cake
# |__chocolate
#  |__tiramisu
# candy
rails = Node.new("root(config,db(migrate,seeds),lib,app(controllers(concerns,api),models(concerns),views))")
puts rails.show_tree
# root
# |__config
# |__db
#  |__migrate
#  |__seeds
# |__lib
# |__app
#  |__controllers
#   |__concerns
#   |__api
#  |__models
#   |__concerns
#  |__views
I'm completely new to Ruby on Rails but I think I might be missing something obvious. I'm currently working on a webapp that scrapes auction websites. The bones of the app were created by someone else. I'm currently trying to add new website scrapes but they don't seem to be working.
I have read through some of the Nokogiri documentation, checked that the scraped information is indeed not being written to the database (the seeded URLs that are being targeted have been, when I check via the Rails console) and used the Chrome extension CSS Selector Tester to check that I am targeting the correct CSS selectors. The record IDs are correct when I check via the Rails console.
I have put what I think are the important sections of code below, but I might be missing something that I don't realise is important.
The websites I'm having issues with are Lot-art.com and Lot-Tissimo.com.
Any help will be much appreciated.
Seeded URLs
Source.create(name: "Auction.fr", query_template: "https://www.auction.fr/_en/lot/search/?contexte=futures&tri=date_debut%20ASC&query={query}&page={page}")
Source.create(name: "Invaluable.co.uk", query_template: "https://www.invaluable.co.uk/search/api/search-results?keyword={query}&size=1000")
Source.create(name: "Interencheres.com", query_template: "http://www.interencheres.com/en/recherche/lot?search%5Bkeyword%5D={query}&page={page}")
Source.create(name: "Gazette-drouot.com", query_template: "http://catalogue.gazette-drouot.com/html/g/recherche.jsp?numPage={page}&filterDate=1&query={query}&npp=100")
Source.create(name: "Lot-art.com", query_template: "http://www.lot-art.com/auction-search/?form_id=lot_search_form&page=1&mq=&q={query}&ord=recent")
Source.create(name: "Lot-tissimo.com", query_template: "https://lot-tissimo.com/en/cmd=s&lwr=&ww={query}&xw=&srt=SN&wg=EUR&page={page}")
Scheduler code
require 'rufus-scheduler'
require 'nokogiri'
require 'mechanize'
require 'open-uri'
require 'net/https'

s = Rufus::Scheduler.singleton

s.interval '1m' do
  setting = Setting.find(1)
  agent = Mechanize.new
  agent.user_agent_alias = 'Windows Chrome'
  agent.cookie_jar.load(File.join(Rails.root, 'tmp/cookies.yaml'))
  List.all.each do |list|
    number_of_new_items = 0
    list.actions.each do |action|
      url = action.source.query_template.gsub('{query}', action.list.query)
      case action.source.id
      when 1 # Auction.fr
        20.downto(1) do |page|
          doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
          doc.css("div.list-products > ul > li").reverse.each do |item_data|
            price = 0
            if item_data.at_css("h3.h4.adjucation.ft-blue") && /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)
              price = /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)[1].gsub(" ", "")
            end
            item = action.items.new(
              title: item_data.at_css("h2").text.strip,
              url: item_data.at_css("h2 a")["href"],
              picture: item_data.at_css("div.image-wrap.lazy div.image img")["src"],
              price: price,
              currency: "€"
            )
            ActiveRecord::Base.logger.silence do # This disables log writing
              if item.save
                number_of_new_items = number_of_new_items + 1
              end
            end
          end
        end
      when 97 # Lot-Tissimo.com
        5.downto(1) do |page|
          doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
          doc.css("#inhalt > .objektliste").reverse.each do |item_data|
            # price = 0
            # if item_data.at_css("h3.h4.adjucation.ft-blue") && /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)
            #   price = /Selling price : ([\d\s]+) €/.match(item_data.at_css("h3.h4.adjucation.ft-blue").text)[1].gsub(" ", "")
            # end
            item = action.items.new(
              title: item_data.at_css("div.objli-desc").text.strip,
              url: item_data.at_css("td.objektliste-foto a")["href"],
              picture: item_data.at_css("td.objektliste-foto a#lot_link img")["src"],
              price: price,
              currency: "€"
            )
            ActiveRecord::Base.logger.silence do # This disables log writing
              if item.save
                number_of_new_items = number_of_new_items + 1
              end
            end
          end
        end
      when 2 # Invaluable.co.uk
        doc = JSON.parse(open(url).read)
        doc["itemViewList"].reverse.each do |item_data|
          puts item_data["itemView"]["photos"]
          item = action.items.new(
            title: item_data["itemView"]["title"],
            url: "https://www.invaluable.co.uk/buy-now/" + item_data["itemView"]["title"].parameterize + "-" + item_data["itemView"]["ref"],
            picture: item_data["itemView"]["photos"] != nil ? item_data["itemView"]["photos"].first["_links"]["medium"]["href"] : nil,
            price: item_data["itemView"]["price"],
            currency: item_data["itemView"]["currencySymbol"]
          )
          ActiveRecord::Base.logger.silence do # This disables log writing
            if item.save
              number_of_new_items = number_of_new_items + 1
            end
          end
        end
      when 3 # Interencheres.com
        # doc = Nokogiri::HTML(open(url))
        5.downto(1) do |page|
          doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
          doc.css("div#lots_0 div.ligne_vente").reverse.each do |item_data|
            price = 0
            item = action.items.new(
              title: item_data.at_css("div.ph_vente div.des_vente p a").text.strip,
              url: "http://www.interencheres.com" + item_data.at_css("div.ph_vente div.des_vente p a")["href"],
              picture: item_data.at_css("div.ph_vente div.gd_ph_vente img")["src"],
              price: price,
              currency: "€"
            )
            ActiveRecord::Base.logger.silence do # This disables log writing
              if item.save
                number_of_new_items = number_of_new_items + 1
              end
            end
          end
        end
      when 4 # Gazette-drouot.com
        5.downto(1) do |page|
          # doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
          doc = agent.get(url.gsub('{page}', page.to_s))
          # doc = agent.get(url)
          doc.css("div#recherche_resultats div.lot_recherche").reverse.each do |item_data|
            price = 0
            picture = item_data.at_css("img.image_thumb_recherche") ? item_data.at_css("img.image_thumb_recherche")["src"] : nil
            item = action.items.new(
              title: item_data.at_css("#des_recherche").text.strip.truncate(140),
              url: "http://catalogue.gazette-drouot.com/html/g/" + item_data.at_css("a.lien_under")["href"],
              picture: picture,
              price: price,
              currency: "€"
            )
            ActiveRecord::Base.logger.silence do # This disables log writing
              if item.save
                number_of_new_items = number_of_new_items + 1
              end
            end
          end
        end
      when 69 # Lot-art.com
        doc = agent.get(url)
        doc.css("div.lot_list_holder").reverse.each do |item_data|
          price = 0
          item = action.items.new(
            title: item_data.at_css("div.lot_list_body a")[0].text.strip.truncate(140),
            url: item_data.at_css("div.lot_list_body")["href"],
            picture: item_data.at_css("a.lot_list_thumb img")["src"],
            price: price,
            currency: "€"
          )
          ActiveRecord::Base.logger.silence do # This disables log writing
            if item.save
              number_of_new_items = number_of_new_items + 1
            end
          end
        end
      end
    end
    if number_of_new_items > 0 && setting.notifications_per_hour > setting.notifications_this_hour && setting.pushover_app_token.present? && setting.pushover_user_key.present?
      url = URI.parse("https://api.pushover.net/1/messages.json")
      req = Net::HTTP::Post.new(url.path)
      req.set_form_data({
        :token => setting.pushover_app_token,
        :user => setting.pushover_user_key,
        :message => "#{number_of_new_items} new items on #{list.name}!",
        :url_title => "Check the list",
        :url => "http://spottheauction.com/lists/#{list.id}"
      })
      res = Net::HTTP.new(url.host, url.port)
      res.use_ssl = true
      res.verify_mode = OpenSSL::SSL::VERIFY_PEER
      res.start { |http| http.request(req) }
    end
  end
  agent.cookie_jar.save(File.join(Rails.root, 'tmp/cookies.yaml'))
end

s.cron '0 * * * *' do
  setting = Setting.find(1)
  setting.notifications_this_hour = 0
  setting.save
end
new just initializes an instance but doesn't save the instance. Do you actually call save somewhere?
You have two options:
Call save on the item:
item = action.items.new(
# ...
)
item.save
Or use create instead of new:
item = action.items.create(
# ...
)
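Either way, when save returns false it is usually a failed validation, so logging the validation errors shows why nothing reaches the database. A small debugging sketch, assuming standard ActiveRecord validations on the Item model:

if item.save
  number_of_new_items += 1
else
  Rails.logger.warn "Item not saved: #{item.errors.full_messages.join(', ')}"
end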
In case someone else comes across this: I got the scraping of lot-art.com to work. It seemed I was lacking specificity in the CSS selector for Nokogiri to pull the correct data.
I am still having issues with lot-tissimo, although that appears to stem from something else, as other scrapers, such as Scrapinghub's Portia spiders, have issues with it too.
I have a system comprising a BoundedQueue class, a Producer class that pushes items into a BoundedQueue object, and a Consumer class that takes items out of the BoundedQueue object; the Producer and Consumer run on separate threads. So that items aren't lost when they're pushed to a full queue, I use a condition variable and a mutex to tell the Producer to wait until the queue has space.
I need to create a test case that checks that the Producer is waiting when the queue is full. I'm not sure if I'm being ditsy or not, but I just can't think of how to do this properly.
BoundedQueue class:
class BoundedQueue
  attr_reader :count

  def initialize(size)
    @mutex = Mutex.new
    @rep = Array.new(size) if size > 0
    @size = size
    @back = size - 1
    @front = 0
    @count = 0
    @condvar = ConditionVariable.new
  end

  def isEmpty?
    @count == 0
  end

  def isFull?
    @count == @size
  end

  def put(item)
    @mutex.synchronize do
      if item != nil
        while isFull?
          @condvar.wait(@mutex)
        end
        @back += 1
        @back = 0 if @back >= @size
        @rep[@back] = item
        @count += 1
      end
    end
  end

  def get
    @mutex.synchronize do
      result = nil
      if !isEmpty?
        result = @rep[@front]
        @rep[@front] = nil
        @front += 1
        @front = 0 if @front >= @size
        @count -= 1
        @condvar.signal
      end
      result
    end
  end
end
Producer class:
class Producer
  def initialize(id, no_items, queue)
    @id = "Producer#{id}"
    @no_items = no_items
    @queue = queue
  end

  def produce
    while @no_items != 0
      @queue.put("Item #{@no_items} from #{@id} ")
      puts "#{@id} putting item #{@no_items} into the queue "
      @no_items -= 1
    end
  end
end
Consumer class:
class Consumer
  def initialize(queue)
    @queue = queue
  end

  def consume
    while !@queue.isEmpty?
      puts "#{@queue.get} consumed from the queue"
      sleep(Random.rand(10))
    end
  end
end
The test case so far:
require "./boundedQueue.rb"
require "./producer.rb"
require "./consumer.rb"
mutex = Mutex.new
cv = ConditionVariable.new
test_queue = BoundedQueue.new(5)
puts "Creating the producer"
producerOne = Producer.new(0,7,test_queue)
puts "Creating consumer"
consumer = Consumer.new(test_queue)
puts "Creating threads"
a = Thread.new{
producerOne.produce
}
b = Thread.new{
consumer.consume
}
puts "Joining threads"
b.join
a.join
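One way to test that the Producer actually blocks is to fill the queue past its capacity on a separate thread and then inspect that thread's status: a thread parked in @condvar.wait reports the status "sleep". A minimal sketch assuming the classes above, without a full test framework:

require "./boundedQueue.rb"

queue = BoundedQueue.new(2)
producer_thread = Thread.new do
  3.times { |i| queue.put("item #{i}") }  # the third put must block
end

sleep 0.5                     # give the thread time to fill the queue and block
puts queue.isFull?            #=> true
puts producer_thread.status   #=> "sleep" (waiting on the condition variable)

queue.get                     # frees a slot and signals, letting the producer finish
producer_thread.join

In a real test you would replace the two puts lines with assertions, e.g. assert_equal "sleep", producer_thread.status in minitest.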
Currently I am getting values from the database with this query:
created_ats = Snapshot.connection.select_all("SELECT created_at from snapshots WHERE snapshot_id >= '#{camera_id}_#{from_date}' AND snapshot_id <= '#{camera_id}_#{to_date}'")
This query gives me all created_ats matching the conditions.
I want to filter these created_ats according to some inputs that users have passed to the database, namely
days and times:
"{"Monday":["3:0-7:0","15:0-17:30"],"Tuesday":[],"Wednesday":["11:0-16:0"],"Thursday":["5:0-10:0"],"Friday":["15:30-22:30"],"Saturday":[],"Sunday":[]}"
This is just an example; the days and times can be totally or partially filled.
There is also a field named INTERVAL, which is interpreted as minutes.
The whole scenario: the query gives me created_at values, which are ordinary Rails timestamps. I want to filter those created_ats by the days and the time ranges provided for each day in the info above, and then apply the X-minute interval on top. That is, after refining by days and timings we have a list of created_ats; starting from the first created_at, add X minutes to it and check whether (created_at + X minutes) is also present in the refined list: if yes, keep it, otherwise skip it.
What I have tried so far is below. It is not working as I want: as far as I can tell there is no error in it, but it still doesn't give me the specific values.
class SnapshotExtractor < ActiveRecord::Base
  establish_connection "evercam_db_#{Rails.env}".to_sym
  belongs_to :camera
  require "rmega"
  require "aws-sdk-v1"
  require 'open-uri'

  def self.connect_mega
    storage = Rmega.login("#{ENV['MEGA_EMAIL']}", "#{ENV['MEGA_PASSWORD']}")
    storage
  end

  def self.connect_bucket
    access_key_id = "#{ENV['AWS_ACCESS_KEY']}"
    secret_access_key = "#{ENV['AWS_SECRET_KEY']}"
    s3 = AWS::S3.new(
      access_key_id: access_key_id,
      secret_access_key: secret_access_key,
    )
    bucket = s3.buckets["evercam-camera-assets"]
    bucket
  end

  # def self.test
  #   snapshot_bucket = connect_bucket
  #   storage = connect_mega
  #   folder = storage.root.create_folder("dongi")
  #   s3_object = snapshot_bucket.objects["gpo-cam/snapshots/1452136326.jpg"]
  #   snap_url = s3_object.url_for(:get, {expires: 1.years.from_now, secure: true}).to_s
  #   File.open("formula.txt", 'w') { |file| file.write(snap_url) }
  #   open('image.jpg', 'wb') do |file|
  #     file << open(snap_url).read
  #   end
  #   folder.upload("image.jpg")
  # end

  def self.extract_snapshots
    running = SnapshotExtractor.where(status: 1).any?
    unless running
      @snapshot_request = SnapshotExtractor.where(status: 0).first
      @snapshot_request.update_attribute(:status, 1)
      camera_id = @snapshot_request.camera_id
      exid = Camera.find(camera_id).exid
      mega_id = @snapshot_request.id
      from_date = @snapshot_request.from_date.strftime("%Y%m%d")
      to_date = @snapshot_request.to_date.strftime("%Y%m%d")
      interval = @snapshot_request.interval
      @days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
      set_days = []
      set_timings = []
      index = 0
      @days.each do |day|
        if @snapshot_request.schedule[day].present?
          set_days[index] = day
          set_timings[index] = @snapshot_request.schedule[day]
          index += 1
        end
      end
      begin
        created_ats = Snapshot.connection.select_all("SELECT created_at from snapshots WHERE snapshot_id >= '#{camera_id}_#{from_date}' AND snapshot_id <= '#{camera_id}_#{to_date}'")
        created_at_spdays = refine_days(created_ats, set_days)
        created_at_sptime = refine_times(created_at_spdays, set_timings, set_days)
        created_at = refine_intervals(created_at_sptime, interval)
        File.open("test.txt", 'w') { |file| file.write(created_at) }
        storage = connect_mega
        creatp = storage.root.create_folder("created_at")
        creatp.upload("test.txt")
      rescue => error
        notify_airbrake(error)
      end
      begin
        storage = connect_mega
        snapshot_bucket = connect_bucket
        new_folder = storage.root.create_folder("#{exid}")
        folder = storage.nodes.find do |node|
          node.type == :folder and node.name == "#{exid}"
        end
        folder.create_folder("#{mega_id}")
        created_at.each do |snap|
          snap_i = DateTime.parse(snap).to_i
          s3_object = snapshot_bucket.objects["#{exid}/snapshots/#{snap_i}.jpg"]
          if s3_object.exists?
            snap_url = s3_object.url_for(:get, {expires: 1.years.from_now, secure: true}).to_s
            File.open("formula_#{snap_i}.txt", 'w') { |file| file.write(snap_url) }
            open('#{snap_i}.jpg', 'wb') do |file|
              file << open(snap_url).read
            end
            folder.upload('#{snap_i}.jpg')
          end
        end
        @snapshot_request.update_attribute(:status, 3)
      rescue => error
        error
      end
    end
    # created_at
  end

  private

  def self.refine_days(created_ats, days)
    created_at = []
    index = 0
    created_ats.each do |single|
      days.each do |day|
        if day == Date.parse(single["created_at"]).strftime("%A")
          created_at[index] = single["created_at"]
          index += 1
        end
      end
    end
    created_at
  end

  def self.refine_times(created_ats, timings, days)
    created_at = []
    index = 0
    day_index = 0
    days_times = days.zip(timings.flatten)
    one = 1
    zero = 0
    created_ats.each do |single|
      days_times.each do |day_time|
        if Date.parse(single).strftime("%A") == day_time[day_index]
          start_time = DateTime.parse(day_time[one].split("-")[zero]).strftime("%H:%M")
          end_time = DateTime.parse(day_time[one].split("-")[one]).strftime("%H:%M")
          created_at_time = DateTime.parse(single).strftime("%H:%M")
          if created_at_time >= start_time && created_at_time <= end_time
            created_at[index] = single
            index += 1
          end
        end
      end
    end
    created_at
  end

  def self.refine_intervals(created_ats, interval)
    created_at = [created_ats.first]
    last_created_at = DateTime.parse(created_ats.last)
    index = 1
    index_for_dt = 0
    length = created_ats.length
    (1..length).each do |single|
      if (DateTime.parse(created_at[index_for_dt]) + interval.minutes) <= last_created_at
        temp = DateTime.parse(created_at[index_for_dt]) + interval.minutes
        created_at[index] = temp.to_s
        index_for_dt += 1
        index += 1
      end
    end
    created_at
  end
end
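For reference, a compact sketch of the filtering idea under one reading of the requirements. All helper names here are hypothetical; schedule is assumed to be the parsed JSON hash from the question and created_ats a sorted array of Time objects:

require 'json'
require 'time'

schedule = JSON.parse('{"Monday":["3:0-7:0","15:0-17:30"],"Wednesday":["11:0-16:0"]}')

# True when the timestamp falls on a scheduled day, inside one of its ranges.
def in_schedule?(time, schedule)
  ranges = schedule[time.strftime("%A")]
  return false if ranges.nil? || ranges.empty?
  ranges.any? do |range|
    from_s, to_s = range.split("-")
    # Time.parse anchors "3:0" etc. to the date of the timestamp being checked
    time >= Time.parse(from_s, time) && time <= Time.parse(to_s, time)
  end
end

# Keep timestamps spaced at least `interval` minutes apart, starting from the first.
def thin_by_interval(times, interval)
  return [] if times.empty?
  kept = [times.first]
  times.each { |t| kept << t if t - kept.last >= interval * 60 }
  kept
end

matching = created_ats.select { |t| in_schedule?(t, schedule) }
puts thin_by_interval(matching, 15)  # e.g. a 15-minute interval

Note this keeps one snapshot per interval rather than requiring an exact created_at + X match in the list, which is one interpretation of the stated requirement.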
I have this Ruby script and I have a problem on line 57.
Please let me know where the error is.
#!/usr/bin/ruby

class CommonLog
  # init method takes log filename, computes frequency counts
  # for ips, urls & statuses
  def initialize(log)
    f = File.open(log)
    @filename = log
    @filesize = f.size
    @ip_counts = Hash.new(0)
    @url_counts = Hash.new(0)
    @status_counts = Hash.new(0)
    @total_records = 0
    f.readlines.each do |line|
      tokens = line.split(' ')
      ip = tokens[0]
      url = tokens[-4]
      status = tokens[-2]
      @ip_counts[ip] += 1
      @url_counts[url] += 1
      @status_counts[status] += 1
      @total_records += 1
    end
    f.close
  end

  # displays filename and bytesize
  def file_info
    "Filename: #{@filename}, Bytes Transferred: #{@filesize}"
  end

  # draws ip histogram
  def ip_hist
    @ip_counts.each do |ip, freq|
      puts "#{ip}: #{'*' * freq}"
    end
    puts file_info
  end

  # draws url histogram
  def url_hist
    @url_counts.each do |url, freq|
      puts "#{url} #{'*' * freq}"
    end
    puts file_info
  end

  # draws list of statuses
  def status_list
    msg = ""
    sorted_status_codes = @status_counts.keys.sort
    sorted_status_codes do |code|
      count = @status_counts[code]
      pct = ((count.to_f / @total_records) * 100).to_i
      msg += "#{code}: #{percentage}%\n"
    end
    puts msg
    puts file_info
  end
end

def test
  log = CommonLog.new('**TEST LOG FILE PATH**')
  log.ip_histogram
  log.url_histogram
  log.status_codes
end

test()
Thank you in advance; your help would be much appreciated.
The log file is on the server, so I removed the path because you won't be able to reach it.
Here is the message I get back after I run the file:
/1.rb:57:in `status_list': undefined method `keys' for nil:NilClass (NoMethodError)
from ./1.rb:73:in `test'
from ./1.rb:75:in `<main>'
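A few things stand out in the posted code, for anyone hitting the same error. In Ruby an instance variable that was never assigned evaluates to nil, which is how @status_counts can surface as "undefined method `keys' for nil:NilClass" (for instance if initialize raised before assigning it, or if the name is misspelled in the real file). Separately, status_list iterates with sorted_status_codes do |code| (missing .each) and interpolates an undefined percentage instead of pct, and the test calls ip_histogram, url_histogram and status_codes, which don't match the defined method names ip_hist, url_hist and status_list. A hedged sketch of a corrected status_list, assuming the intent of the posted code:

def status_list
  msg = ""
  @status_counts.keys.sort.each do |code|
    count = @status_counts[code]
    pct = ((count.to_f / @total_records) * 100).to_i
    msg += "#{code}: #{pct}%\n"
  end
  puts msg
  puts file_info
end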