Why does the domain change? - Twitter

Dear community,
I have another question about changing domains. I am scraping the links on a webpage and I found this link:
https://www.spar.si/sl_SI/splet/spar-slovenija-twitter.html
I would like to remain on the same domain, but when my spider follows this link, it changes into this:
https://twitter.com/sparslovenija
Is there some way to tell the spider not to go to other domains? For now the only solution I see is to check the links in the "parse" function and tell it not to move on if the domain contains the word "Twitter", but that is not an elegant solution. I would like to detect the domain change automatically, whatever the change may be. Do you have any ideas?
Thank you in advance.
My code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# encoding=UTF-8
import scrapy, urlparse, time, sys
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from urlparse import urlparse, urljoin
from vacancies.items import JobItem
#We need that in order to force Slovenian pages instead of English pages. It happened at "http://www.g-gmi.si/gmiweb/" that only English pages were found and no Slovenian.
from scrapy.conf import settings
settings.overrides['DEFAULT_REQUEST_HEADERS'] = {'Accept':'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8','Accept-Language':'sl',}
#settings.overrides['DEFAULT_REQUEST_HEADERS'] = {'Accept':'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8','Accept-Language':'sl','en':q=0.8,}
#start_time = time.time()
# We run the programme in the command line with this command:
# scrapy crawl jobs -o urls.csv -t csv --logfile log.txt
# We get two output files
# 1) urls.csv
# 2) log.txt
# Url whitelist.
with open("Q:/SIIT/JV_Marko_Boro/Detector/kljucne_besede/url_whitelist.txt", "r+") as kw:
url_whitelist = kw.read().replace('\n', '').split(",")
url_whitelist = map(str.strip, url_whitelist)
# Tab whitelist.
# We need to replace character the same way as in detector.
with open("Q:/SIIT/JV_Marko_Boro/Detector/kljucne_besede/tab_whitelist.txt", "r+") as kw:
tab_whitelist = kw.read().decode(sys.stdin.encoding).encode('utf-8')
tab_whitelist = tab_whitelist.replace('Ŕ', 'č')
tab_whitelist = tab_whitelist.replace('╚', 'č')
tab_whitelist = tab_whitelist.replace('Ő', 'š')
tab_whitelist = tab_whitelist.replace('Ü', 'š')
tab_whitelist = tab_whitelist.replace('Ä', 'ž')
tab_whitelist = tab_whitelist.replace('×', 'ž')
tab_whitelist = tab_whitelist.replace('\n', '').split(",")
tab_whitelist = map(str.strip, tab_whitelist)
#File to write unique links
unique = open("G:/myVE/vacancies/unique_urls.txt", "wb")
class JobSpider(scrapy.Spider):
    name = "jobs"

    #Test sample of SLO companies
    start_urls = [
        "http://www.seltron.si/"
    ]
    print start_urls

    #Result of the programme is this list of job vacancies webpages.
    jobs_urls = []
    #I would like to see how many unique links we check on every page.
    unique_urls = []

    def parse(self, response):
        response.selector.remove_namespaces()
        #We take all urls, they are marked by "href". These are either webpages on our website or new websites.
        urls = response.xpath('//@href').extract()
        #Base url.
        base_url = get_base_url(response)
        #Loop through all urls on the webpage.
        for url in urls:
            url = url.strip()
            #Ignore ftp.
            if url.startswith("ftp"):
                continue
            #If url doesn't start with "http", it is relative url, and we add base url to get absolute url.
            # -- It is true, that we may get some strange urls, but it is fine for now.
            if not (url.startswith("http")):
                url = urljoin(base_url,url)
            #self.f.write(str(url.encode('utf-8')).strip() + "\n")
            #If url represents a picture, a document, a compression ... we ignore it. We might have to change that because some companies provide job vacancies information in PDF.
            if url.endswith((
                #images
                '.jpg', '.jpeg', '.png', '.gif', '.eps', '.ico', '.svg', '.tif', '.tiff',
                '.JPG', '.JPEG', '.PNG', '.GIF', '.EPS', '.ICO', '.SVG', '.TIF', '.TIFF',
                #documents
                '.xls', '.ppt', '.doc', '.xlsx', '.pptx', '.docx', '.txt', '.csv', '.pdf', '.pd',
                '.XLS', '.PPT', '.DOC', '.XLSX', '.PPTX', '.DOCX', '.TXT', '.CSV', '.PDF', '.PD',
                #music and video
                '.mp3', '.mp4', '.mpg', '.ai', '.avi', '.swf',
                '.MP3', '.MP4', '.MPG', '.AI', '.AVI', '.SWF',
                #compressions and other
                '.zip', '.rar', '.css', '.flv', '.php',
                '.ZIP', '.RAR', '.CSS', '.FLV', '.PHP',
            )):
                #self.f1.write("IMAGE " + str(url) + "\n")
                continue
            #If url includes characters like ?, %, &, # ... it is LIKELY NOT to be the one we are looking for and we ignore it.
            #However in this case we exclude good urls like http://www.mdm.si/company#employment
            if any(x in url for x in ['%', '~']):
                continue
            #We need to save original url for xpath, in case we change it later (join it with base_url)
            url_xpath = url
            #If url doesn't start with "http", it is relative url, and we add base url to get absolute url.
            # -- It is true, that we may get some strange urls, but it is fine for now.
            if not (url.startswith("http")):
                url = urljoin(base_url,url)
            #Counting unique links.
            if url not in self.unique_urls:
                self.unique_urls.append(url)
                unique.write(str(url) + "\n")
            #We don't want to go to other websites. We want to stay on our website, so we keep only urls with domain (netloc) of the company we are investigating.
            if (urlparse(url).netloc == urlparse(base_url).netloc):
                #The main part. We look for webpages, whose urls include one of the employment words as strings.
                #We will check the tab of the url as well. This is an additional filter, suggested by Dan Wu, to improve accuracy.
                tabs = response.xpath('//a[@href="%s"]/text()' % url_xpath).extract()
                # Sometimes tabs can be just empty spaces like '\t' and '\n' so in this case we replace them with [].
                # That was the case when the spider didn't find this employment url: http://www.terme-krka.com/si/sl/o-termah-krka/o-podjetju-in-skupini-krka/zaposlitev/
                tabs = [tab.encode('utf-8') for tab in tabs]
                tabs = [tab.replace('\t', '') for tab in tabs]
                tabs = [tab.replace('\n', '') for tab in tabs]
                tab_empty = True
                for tab in tabs:
                    if tab != '':
                        tab_empty = False
                if tab_empty == True:
                    tabs = []
                # -- Instruction.
                # -- Users in other languages, please insert employment words in your own language, like jobs, vacancies, career, employment ... --
                # keyword_url starts empty, then we add keywords as we find them in the url. This is for tracking purposes.
                keyword_url = ''
                #if any(x in url for x in keywords):
                for keyword in url_whitelist:
                    if keyword in url:
                        keyword_url = keyword_url + keyword + ' '
                # If we find at least one keyword in the url, we continue.
                if keyword_url != '':
                    #1. Tabs are empty.
                    if tabs == []:
                        #print "No text for url: " + str(url)
                        #We found url that includes one of the magic words.
                        #We check url, if we have found it before. If it is new, we add it to the list "jobs_urls".
                        if url not in self.jobs_urls:
                            self.jobs_urls.append(url)
                            item = JobItem()
                            item["url"] = url
                            item["keyword_url"] = keyword_url
                            item["keyword_url_tab"] = ' '
                            item["keyword_tab"] = ' '
                            print url
                            #We return the item.
                            yield item
                    #2. There are texts, one or more.
                    else:
                        #For the same partial url several texts are possible.
                        for tab in tabs:
                            keyword_url_tab = ''
                            for key in tab_whitelist:
                                if key in tab:
                                    keyword_url_tab = keyword_url_tab + key + ' '
                            if keyword_url_tab != '':
                                # keyword_url_tab starts with keyword_url from before, because we want to remember keywords from both url and tab.
                                keyword_url_tab = 'URL ' + keyword_url + ' TAB ' + keyword_url_tab
                                #if any(x in text for x in keywords):
                                #We found url that includes one of the magic words and also the tab includes a magic word.
                                #We check url, if we have found it before. If it is new, we add it to the list "jobs_urls".
                                if url not in self.jobs_urls:
                                    self.jobs_urls.append(url)
                                    item = JobItem()
                                    item["url"] = url
                                    item["keyword_url"] = ' '
                                    item["keyword_url_tab"] = keyword_url_tab
                                    item["keyword_tab"] = ' '
                                    print url
                                    #We return the item.
                                    yield item
                else:
                    for tab in tabs:
                        #print "TABS " + str(tabs)
                        #print "TAB " + str(type(tab))
                        keyword_tab = ''
                        for key in tab_whitelist:
                            #print "KEY " + str(type(key))
                            if key in tab:
                                keyword_tab = keyword_tab + key + ' '
                        if keyword_tab != '':
                            if url not in self.jobs_urls:
                                self.jobs_urls.append(url)
                                item = JobItem()
                                item["url"] = url
                                item["keyword_url"] = ' '
                                item["keyword_url_tab"] = ' '
                                item["keyword_tab"] = keyword_tab
                                print url
                                #We return the item.
                                yield item
                #We don't put an "else" clause because we want to further explore the employment webpage to find possible new employment webpages.
                #We keep looking for employment webpages until we reach the DEPTH that we have set in settings.py.
                yield Request(url, callback = self.parse)

Related

CSV::MalformedCSVError: Unquoted fields do not allow \r or \n in Ruby

I have CSVs which I am trying to import into my Oracle database, but unfortunately I keep getting the same error:
> CSV::MalformedCSVError: Unquoted fields do not allow \r or \n (line 1).
I know there are tons of similar questions that have been asked, but none relate specifically to my issue other than this one, and unfortunately it didn't help.
To explain my scenario:
I have CSVs in which the rows don't always end with a value; sometimes a row ends with just a comma, because the last value is null and stays blank.
I would like to import the CSVs regardless of whether a row ends with a comma or not.
Here are the first five lines of my CSV, with values changed for privacy reasons:
id,customer_id,provider_id,name,username,password,salt,email,description,blocked,created_at,updated_at,deleted_at
1,,1,Default Administrator,admin,1,1," ",Initial default user.,f,2019-10-04 14:28:38.492000,2019-10-04 14:29:34.224000,
2,,2,Default Administrator,admin,2,1,,Initial default user.,,2019-10-04 14:28:38.633000,2019-10-04 14:28:38.633000,
3,1,,Default Administrator,admin,3,1," ",Initial default user.,f,2019-10-04 14:41:38.030000,2019-11-27 10:23:03.329000,
4,1,,admin,admin,4,1," ",,,2019-10-28 12:21:23.338000,2019-10-28 12:21:23.338000,
5,2,,Default Administrator,admin,5,1," ",Initial default user.,f,2019-11-12 09:00:49.430000,2020-02-04 08:20:06.601000,2020-02-04 08:20:06.601000
As you can see, the rows sometimes end with a comma and sometimes without one, and this structure repeats quite often.
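Incidentally, Ruby's CSV parser already treats a trailing comma as an empty (nil) last field, so the column count is the same either way; the trailing comma itself is not the problem. A quick check with made-up values:
require 'csv'
# A trailing comma simply becomes a nil last field; the column count stays the same.
CSV.parse_line("4,1,,admin,")      # => ["4", "1", nil, "admin", nil]
CSV.parse_line("4,1,,admin,done")  # => ["4", "1", nil, "admin", "done"]
So the error is more likely about the \r/\n line endings than about the missing values, which matches the fix described below.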
This is the code I have been playing around with:
def csv_replace_empty_string
  Dir.foreach(Rails.root.join('db', 'csv_export')) do |filename|
    next if filename == '.' or filename == '..' or filename == 'extract_db_into_csv.sh' or filename == 'import_csv.rb'
    read_file = File.read(Rails.root.join('db', 'csv_export', filename))
    replace_empty_string = read_file.gsub(/(?<![^,])""(?![^,])/, '" "')
    format_csv = replace_empty_string.gsub(/\r\r?\n?/, "\n")
    # format_csv = remove_empty_lines.sub!(/(?:\r?\n)+\z/, "")
    File.open(Rails.root.join('db', 'csv_export', filename), "w") { |file| file.puts format_csv }
  end
end
I have tried using many different kinds of gsubs found online in similar forums, but it didn't help.
Here is my function for importing the CSV in the db:
def import_csv_into_db
  Dir.foreach(Rails.root.join('db', 'csv_export')) do |filename|
    next if filename == '.' or filename == '..' or filename == 'extract_db_into_csv.sh' or filename == 'import_csv.rb'
    filename_renamed = File.basename(filename, File.extname(filename)).classify
    CSV.foreach(Rails.root.join('db', 'csv_export', filename), :headers => true, :skip_blanks => true) do |row|
      class_name = filename_renamed.constantize
      class_name.create!(row.to_hash)
      puts "Insert on table #{filename_renamed} complete"
    end
  end
end
I have also tried the options provided by CSV such as :row_sep => :"\n" or :row_sep => "\r" but keep on getting the same error.
I am pretty sure I have some sort of thinking error, but I can't seem to figure it out.
I fixed the issue by using the following:
format_csv = replace_empty_string.gsub(/\r\r?\n?/, "\n")
This was originally @mgrims' answer, but I had to adjust my code by further removing the :skip_blanks and :row_sep options.
It is importing successfully now!
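To illustrate what that substitution does, here is a minimal sketch with made-up sample data (not the actual export):
# Rows whose line endings are a mix of \r\r\n and \r\n, as the export seems to produce.
raw = "1,,1,Default Administrator,\r\r\n2,,2,Default Administrator,\r\n"
# Normalize every \r\r\n, \r\n or stray \r to a plain \n so CSV sees clean rows.
clean = raw.gsub(/\r\r?\n?/, "\n")
puts clean.inspect
# => "1,,1,Default Administrator,\n2,,2,Default Administrator,\n"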

How to fix slow Nokogiri parsing

I have a Rake task in my Rails app which looks into a folder for an XML file, parses it, and saves it to a database. The code works OK, but I have about 2100 files totaling 1.5GB, and processing is very slow, about 400 files in 7 hours. There are approximately 600-650 contracts in each XML file, and each contract can have 0 to n attachments. I did not paste all values, but each contract has 25 values.
To speed up the process I use the activerecord-import gem, so I build an array per file and, once the whole file is parsed, do a mass import to Postgres. Only if a record is already found is it updated directly and/or a new attachment inserted, but that is maybe 1 out of 100000 records. This helps a little compared with creating a new record per contract, but now I see that the slow part is the XML parsing. Can you please check whether I am doing something wrong in my parsing?
When I tried to print the arrays I am building, the slow part was waiting until the whole file was loaded/parsed before it started printing array by array. That's why I assume the speed problem is in parsing, as Nokogiri loads the whole XML before it starts.
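Incidentally, if building the full DOM really is the bottleneck, Nokogiri's streaming Reader can walk the file without loading it all into memory. This is only a rough sketch, not the approach taken in the answer below, and it assumes each record is a self-contained element named contract, which is a guess since the task selects //contracts/* (the original rake task follows right after this sketch):
require 'nokogiri'
# Stream the file instead of parsing it into one huge document.
reader = Nokogiri::XML::Reader(File.open('crzfiles/example.xml'))
reader.each do |node|
  next unless node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT && node.name == 'contract'
  # Parse just this contract's subtree as a small document.
  contract = Nokogiri::XML(node.outer_xml).root
  name  = contract.at('name').text
  crzid = contract.at('ID').text
  # ... collect the values into an array for the mass import, as in the task below.
end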
require 'nokogiri'
require 'pp'
require "activerecord-import/base"
ActiveRecord::Import.require_adapter('postgresql')
namespace :loadcrz2 do
  desc "this task load contracts from crz xml files to DB"
  task contracts: :environment do
    actual_dir = File.dirname(__FILE__).to_s
    Dir.foreach(actual_dir+'/../../crzfiles') do |xmlfile|
      next if xmlfile == '.' or xmlfile == '..' or xmlfile == 'archive'
      page = Nokogiri::XML(open(actual_dir+"/../../crzfiles/"+xmlfile))
      puts xmlfile
      cons = page.xpath('//contracts/*')
      contractsarr = []
      #c =[]
      cons.each do |contract|
        name = contract.xpath("name").text
        crzid = contract.xpath("ID").text
        procname = contract.xpath("procname").text
        conname = contract.xpath("contractorname").text
        subject = contract.xpath("subject").text
        dateeff = contract.xpath("dateefficient").text
        valuecontract = contract.xpath("value").text
        attachments = contract.xpath('attachments/*')
        attacharray = []
        attachments.each do |attachment|
          attachid = attachment.xpath("ID").text
          attachname = attachment.xpath("name").text
          doc = attachment.xpath("document").text
          size = attachment.xpath("size").text
          arr = [attachid, attachname, doc, size]
          attacharray.push arr
        end
        @con = Crzcontract.find_by_crzid(crzid)
        if @con.nil?
          @c = Crzcontract.new(:crzname => name, :crzid => crzid, :crzprocname => procname, :crzconname => conname, :crzsubject => subject, :dateeff => dateeff, :valuecontract => valuecontract)
        else
          @con.crzname = name
          @con.crzid = crzid
          @con.crzprocname = procname
          @con.crzconname = conname
          @con.crzsubject = subject
          @con.dateeff = dateeff
          @con.valuecontract = valuecontract
          @con.save!
        end
        attacharray.each do |attar|
          attachid = attar[0]
          attachname = attar[1]
          doc = attar[2]
          size = attar[3]
          @at = Crzattachment.find_by_attachid(attachid)
          if @at.nil?
            if @con.nil?
              @c.crzattachments.build(:attachid => attachid, :attachname => attachname, :doc => doc, :size => size)
            else
              @a = Crzattachment.new
              @a.attachid = attachid
              @a.attachname = attachname
              @a.doc = doc
              @a.size = size
              @a.crzcontract_id = @con.id
              @a.save!
            end
          end
        end
        if @c.present?
          contractsarr.push @c
        end
        #p @c
      end
      #p contractsarr
      puts "done"
      if contractsarr.present?
        Crzcontract.import contractsarr, recursive: true
      end
      FileUtils.mv(actual_dir+"/../../crzfiles/"+xmlfile, actual_dir+"/../../crzfiles/archive/"+xmlfile)
    end
  end
end
There are a number of problems with the code. Here are some ways to improve it:
actual_dir = File.dirname(__FILE__).to_s
Don't use to_s. dirname is already returning a string.
actual_dir+'/../../crzfiles', with and without a trailing path delimiter is used repeatedly. Don't make Ruby rebuild the concatenated string over and over. Instead define it once, but take advantage of Ruby's ability to build the full path:
File.absolute_path('../../bar', '/path/to/foo') # => "/path/bar"
So use:
actual_dir = File.absolute_path('../../crzfiles', __FILE__)
and then refer to actual_dir only:
Dir.foreach(actual_dir)
This is unwieldy:
next if xmlfile == '.' or xmlfile == '..' or xmlfile == 'archive'
I'd do:
next if (xmlfile[0] == '.' || xmlfile == 'archive')
or even:
next if xmlfile[/^(?:\.|archive)/]
Compare these:
'.hidden'[/^(?:\.|archive)/] # => "."
'.'[/^(?:\.|archive)/] # => "."
'..'[/^(?:\.|archive)/] # => "."
'archive'[/^(?:\.|archive)/] # => "archive"
'notarchive'[/^(?:\.|archive)/] # => nil
'foo.xml'[/^(?:\.|archive)/] # => nil
The pattern will return a truthy value if the name starts with '.' or with 'archive'. It's not as readable but it's compact. I'd recommend the compound conditional test though.
In some places, you're concatenating xmlfile, so again let Ruby do it once:
xml_filepath = File.join(actual_dir, xmlfile)
which will honor the file path delimiter for whatever OS you're running on. Then use xml_filepath instead of concatenating the name:
xml_filepath = File.join(actual_dir, xmlfile)
page = Nokogiri::XML(open(xml_filepath))
[...]
FileUtils.mv(xml_filepath, File.join(actual_dir, "archive", xmlfile))
join is a good tool so take advantage of it. It's not just another name for concatenating strings, because it's also aware of the correct delimiter to use for the OS the code is running on.
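For instance (output shown for a POSIX-style path; the segment names are just placeholders):
File.join('crzfiles', 'archive', 'contracts_01.xml') # => "crzfiles/archive/contracts_01.xml"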
You use a lot of instances of:
xpath("some_selector").text
Don't do that. xpath, along with css and search return a NodeSet, and text when used on a NodeSet can be evil in a way that'll hurtle you down a very steep and slippery slope. Consider this:
require 'nokogiri'
doc = Nokogiri::XML(<<EOT)
<root>
<node>
<data>foo</data>
</node>
<node>
<data>bar</data>
</node>
</root>
EOT
doc.search('//node/data').class # => Nokogiri::XML::NodeSet
doc.search('//node/data').text # => "foobar"
The concatenation of the text into 'foobar' can't be split easily and it's a problem we see here in questions too often.
Do this if you expect getting a NodeSet back because of using search, xpath or css:
doc.search('//node/data').map(&:text) # => ["foo", "bar"]
It's better to use at, at_xpath or at_css if you're after a specific node because then text will work as you'd expect.
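For comparison, a quick sketch using at on the same document defined above:
doc.at('//node/data').class # => Nokogiri::XML::Element
doc.at('//node/data').text  # => "foo"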
See "How to avoid joining all text from Nodes when scraping" also.
There's a lot of replication that could be DRY'd. Instead of this:
name = contract.xpath("name").text
crzid = contract.xpath("ID").text
procname = contract.xpath("procname").text
You could do something like:
name, crzid, procname = [
  'name', 'ID', 'procname'
].map { |s| contract.at(s).text }

Iterating through 9 times and I think I have a syntax error

@numbers = [1,2,3,4,5,6,7,8,9]
@numbers.each do |n|
  if @order.card_type"n" != "none"
    notes += "\n\nCard 1: " + @order.card_type1 + "\nPaper Weight: " + @order.paper_weight1 + "\nQuantity: " + @order.quantity1 + "\nInk Color #1: " + @order.ink_color11 + "\nInk Color #2: " + @order.ink_color12 + "\nWording: " + @order.wording1 + "\nReturn Address Printing: " + @order.return_address1 + "\nGuest Address Printing: " + @order.guest_address1.to_s + "\nEnvelope Liners: " + @order.envelope_liners1
  end
end
I am not sure what I am doing wrong but I need to replace "n" with 1-9 in my controller. I can't figure out the right way to do this.
Just access it with [] notation and string manipulation:
(1..9).each do |n|
  if @order["card_type#{n}"] != "none"
    # ...
  end
end
Failing that, you can also do:
@order.send("card_type#{n}")
There are a number of issues here.
The first, already solved, is that you'll need a send to access the value:
if @order.send(:"card_type#{n}") != "none" ...
The second is that each of the individual elements inside the string is also numerically named:
Card 1: #{@order.card_type1}
Right now you'll be using the same value throughout the loop so you'll need to use send:
#{@order.send(:"card_type#{n}")}
Third, you can clean up the ginormous string concat using a heredoc:
if @order.send(:"card_type#{n}") != "none"
  notes += <<-EOS
Card 1: #{@order.card_type1}
Paper Weight: #{@order.paper_weight1}
Quantity: #{@order.quantity1}
Ink Color #1: #{@order.ink_color11}
Ink Color #2: #{@order.ink_color12}
Wording: #{@order.wording1}
Return Address Printing: #{@order.return_address1}
Guest Address Printing: #{@order.guest_address1.to_s}
Envelope Liners: #{@order.envelope_liners1}
  EOS
end
Fourth, rather than do this, use a collection in the first place, and throw in some helper methods that'll significantly clean up your mainline code. Here I'm using POROs but the exact same mechanics apply.
I'm assuming some sort of order item. I shortened the number of attributes for brevity:
class OrderItem
  attr_accessor :type, :wording

  def initialize(type, wording)
    @type = type
    @wording = wording
  end

  def valid_item?
    type != 'none'
  end

  def item_info
    <<-EOS
Card Type: #{type}
Wording: #{wording}
    EOS
  end
end
An order consists of a collection of those items. (You can limit to nine in a variety of ways, this is not reflected here.)
class Order
  attr_accessor :items

  def initialize(items)
    @items = items
  end

  def valid_items
    items.find_all &:valid_item?
  end
end
In a Rails app each of these would be ActiveRecord models, stored in the DB.
To simulate an order I'm creating them manually and making sure one has the "none" type:
items = 4.times.collect { |n| OrderItem.new('not none', "wording #{n}") }
items[2].type = 'none'
order = Order.new(items)
To get a string containing the order info for non-"none" types:
output = order.valid_items.collect(&:item_info).join("\n--\n")
And if you print that out:
Card Type: not none
Wording: wording 0
--
Card Type: not none
Wording: wording 1
--
Card Type: not none
Wording: wording 3
Note the order item I set to "none" doesn't appear.
Now, I've taken liberties with naming (not knowing your domain) and there are various tweaks you'll need to make (like... why generate text output in a Rails app), but this shows one possible path you could take to clean up the code, and reduce the amount of thought it takes to understand the mainline code.
Even if you don't separate out the order items and stick with the numerically-named order attributes you can still isolate the confusing code to pull out the number-based attributes, figure out which ones are necessary (e.g., spin through the card types and get an array of the numbers that aren't none and use that to access all the other fields).
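A rough sketch of that last idea, keeping the numerically-named attributes (the attribute names are borrowed from the question, so treat the details as assumptions):
# Numbers whose card type is actually set.
active_numbers = (1..9).select { |n| @order.send("card_type#{n}") != "none" }
# Use those numbers to reach the matching fields.
notes = active_numbers.map { |n|
  "Card #{n}: #{@order.send("card_type#{n}")}\nWording: #{@order.send("wording#{n}")}"
}.join("\n\n")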

ROR/Hpricot: parsing a site and searching/comparing strings with regex

I just started with Ruby On Rails, and want to create a simple web site crawler which:
Goes through all the Sherdog fighters' profiles.
Gets the Referees' names.
Compares names with the old ones (both during the site parsing and from the file).
Prints and saves all the unique names to the file.
An example URL is: http://www.sherdog.com/fighter/Fedor-Emelianenko-1500
I am searching for tag entries like <span class="sub_line">Dan Miragliotta</span>; unfortunately, in addition to the referee names I need, the same class is also used for:
The date.
"N/A" when the referee name is not known.
I need to discard all results with an "N/A" string, as well as any string which contains numbers. I managed to do the first part but couldn't figure out how to do the second. I tried searching, thinking and experimenting but, after experimenting and rewriting, managed to break the whole program and don't know how to (properly) fix it:
require 'rubygems'
require 'hpricot'
require 'simplecrawler'

# Set up a new crawler
sc = SimpleCrawler::Crawler.new("http://www.sherdog.com/fighter/Fedor-Emelianenko-1500")
sc.maxcount = 1
sc.include_patterns = [".*/fighter/.*$", ".*/events/.*$", ".*/organizations/.*$", ".*/stats/fightfinder\?association/.*$"]

# The crawler yields a Document object for each visited page.
sc.crawl { |document|
  # Parse the page with Hpricot and print the matching spans
  hdoc = Hpricot(document.data)
  (hdoc/"td/span[@class='sub_line']").each do |span|
    if span.inner_html == 'N/A' || Regexp.new(".*/\d\.*$").match(span.inner_html)
      # puts "Test"
    else
      puts span.inner_html
      #File.open("File_name.txt", 'a') {|f| f.puts(hdoc.span.inner_html) }
    end
  end
}
I would also appreciate help with ideas on the rest of the program: How do I properly read the current names from the file, if the program is run multiple times, and how do I make the comparisons for the unique names?
Edit:
After some proposed improvements, here is what I got:
require 'rubygems'
require 'simplecrawler'
require 'nokogiri'
#require 'open-uri'

sc = SimpleCrawler::Crawler.new("http://www.sherdog.com/fighter/Fedor-Emelianenko-1500")
sc.maxcount = 1

sc.crawl { |document|
  doc = Nokogiri::HTML(document.data)
  names = doc.css('td:nth-child(4) .sub-line').map(&:content).uniq.reject { |c| c == 'N/A' }
  puts names
}
Unfortunately, the code still doesn't work - it returns a blank.
If instead of doc = Nokogiri::HTML(document.data), I write doc = Nokogiri::HTML(open(document.data)), then it gives me the whole page, but, parsing still doesn't work.
hpricot isn't maintained anymore. How about using nokogiri instead?
names = document.css('td:nth-child(4) .sub-line').map(&:content).uniq.reject { |c| c == 'N/A' }
=> ["Yuji Shimada", "Herb Dean", "Dan Miragliotta", "John McCarthy"]
A breakdown of the different parts:
document.css('td:nth-child(4) .sub-line')
This returns an array of html elements with the class name sub-line that are in the fourth table column.
.map(&:content)
For each element in the previous array, return element.content (the inner html). This is equivalent to map({ |element| element.content }).
.uniq
Remove duplicate values from the array.
.reject { |c| c == 'N/A' }
Remove elements whose value is "N/A"
You would use array math (-) to compare them:
get referees from the current page
current_referees = doc.search('td[4] .sub_line').map(&:inner_text).uniq - ['N/A']
read old referees from the file
old_referees = File.read('old_referees.txt').split("\n")
use Array#- to compare them
new_referees = current_referees - old_referees
write the new file
File.open('new_referees.txt','w'){|f| f << new_referees * "\n"}
This will return all the names, ignoring dates and "N/A":
puts doc.css('td span.sub_line').map(&:content).reject{ |s| s['/'] }.uniq
It results in:
Yuji Shimada
Herb Dean
Dan Miragliotta
John McCarthy
Adding these to a file and removing duplicates is left as an exercise for you, but I'd use some magical combination of File.readlines, sort and uniq followed by a bit of File.open to write the results.
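A rough sketch of that combination (the file name and the scraped_names variable are assumptions; scraped_names stands for the list of names produced by the line above):
# Merge the freshly scraped names with whatever is already on disk, then rewrite the file.
existing = File.exist?('referees.txt') ? File.readlines('referees.txt').map(&:chomp) : []
merged = (existing + scraped_names).sort.uniq
File.open('referees.txt', 'w') { |f| f.puts merged }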
Here is the final answer
require 'rubygems'
require 'simplecrawler'
require 'nokogiri'
require 'open-uri'

# Mute log messages
module SimpleCrawler
  class Crawler
    def log(message)
    end
  end
end

n = 0 # Counts how many pages/profiles have been processed

sc = SimpleCrawler::Crawler.new("http://www.sherdog.com/fighter/Fedor-Emelianenko-1500")
sc.maxcount = 150000
sc.include_patterns = [".*/fighter/.*$", ".*/events/.*$", ".*/organizations/.*$", ".*/stats/fightfinder\?association/.*$"]

old_referees = File.read('referees.txt').split("\n")

sc.crawl { |document|
  doc = Nokogiri::HTML(document.data)
  current_referees = doc.search('td[4] .sub_line').map(&:text).uniq - ['N/A']
  new_referees = current_referees - old_referees
  n += 1

  # If new referees were found, print statistics
  if !new_referees.empty? then
    puts n.to_s + ". " + new_referees.length.to_s + " new : " + new_referees.to_s + "\n"
  end

  new_referees = new_referees + old_referees
  old_referees = new_referees.uniq
  old_referees.reject!(&:empty?)

  # Performance optimization. Saves only every 10th profile.
  if n % 10 == 0 then
    File.open('referees.txt','w'){|f| f << old_referees * "\n" }
  end
}

File.open('referees.txt','w'){|f| f << old_referees * "\n" }

Ruby on Rails Strings: Find next character that is not a letter or number?

I'm making a mentions feature right now, so when a user types @, the next part they type for a username is clickable until a space appears. This assumes they type in a username correctly; usernames only have letters and numbers. I need it to work so that if they type "Hi @jon!" it treats the exclamation point (or any symbol that is not a letter or number) as not part of the username and excludes it, instead of just looking for the following space.
This is what I have:
while @comment.content.include? "@" do
  at = @comment.content.index('@')
  space = @comment.content.index(' ', at)
  length = space - at
  usernotag = @comment.content[at + 1, length - 1]
  userwtag = @comment.content[at, length]
  @user = User.where(:username => usernotag.downcase).first
  @mentioned_users.push(@user)
  replacewith = "<a href='/" + usernotag + "'>*%^$&*)()_+!$" + usernotag + "</a>"
  @comment.content = @comment.content.gsub(userwtag, replacewith)
end
@comment.content = @comment.content.gsub("*%^$&*)()_+!$", "@")
Any idea what I should do?
You should use a regular expression to parse/extract the user references:
# Transform comment content inline.
@comment.content.gsub!(/@[\w\d]+/) { |user_ref| link_if_user_reference(user_ref) }
@comment.save!

# Helper to generate a link to the user, if the user exists
def link_if_user_reference(user_ref)
  username = user_ref[1..-1]
  return user_ref unless User.find_by_name(username)
  link_to user_ref, "/users/#{username}"
  # => produces link @username => /users/username
end
This assumes your usernames are restricted to alphanumeric characters as you said (letters or numbers). If you have other characters, you can add them to the set included in your regular expression.
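For example, if usernames could also contain dots or dashes, the character class could be widened (a hypothetical variation, not something the question asks for):
# Also allow '.' and '-' inside a mention.
@comment.content.gsub!(/@[\w.\-]+/) { |user_ref| link_if_user_reference(user_ref) }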
