Scrapy does not follow the Request url

Following is the code. Basically, I am scraping movie info from IMDb.com, but somehow the Request for the URL stored in the variable "addr" is never followed: the "print" I put into parse_item2 simply never shows up.
This is driving me crazy; I have spent hours on it. Could anyone with some experience help? Thank you so much.
# code for the spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request

from beta.items import BetaItem

class AlphaSpider(CrawlSpider):
    name = 'alpha'
    allowed_domains = ['amazon.com', 'imdb.com']
    start_urls = ['http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us&title_type=feature&year=2005,2005']

    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=('//td/a',), allow=('/title/',)),
             callback='parse_item1'),
    )

    def parse_item1(self, response):
        sel = Selector(response)
        item = BetaItem()
        # the canonical link looks like http://www.imdb.com/title/tt0123456/
        idb = sel.xpath('//link[@rel="canonical"]/@href').extract()
        idb = idb[0].split('/')[-2]
        item['idb'] = idb
        title = sel.xpath('//h1[@class="header"]/span[@class="itemprop"]/text()').extract()
        item['title'] = title
        # follow the /business page for the same title, carrying the item along
        addr = 'http://www.imdb.com/title/' + idb + '/business'
        request = Request(addr, callback=self.parse_item2)
        request.meta['item'] = item
        return request

    def parse_item2(self, response):
        print 'I am here'
        item = response.meta['item']
        sel = Selector(response)
        # BLA BLA BLA
        return item

The reason for the problem is indeed what Blender said in his comment above: crawling simply takes quite some time for some particular requests, so the parse_item2 callback does fire, just much later than expected.
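For anyone hitting the same symptom: a quick way to confirm that the request really is scheduled and merely slow is to timestamp it on the way out and again when the callback fires. This is a minimal sketch of that idea, not the original code; it uses the modern scrapy.Spider API, and DebugSpider and the /business URL are purely illustrative:

import time

import scrapy

class DebugSpider(scrapy.Spider):
    # Hypothetical spider, only to illustrate timing a slow follow-up request.
    name = 'debug'
    start_urls = ['http://www.imdb.com/title/tt0372784/']

    def parse(self, response):
        request = scrapy.Request(
            'http://www.imdb.com/title/tt0372784/business',
            callback=self.parse_business)
        # remember when the request left the callback
        request.meta['scheduled_at'] = time.time()
        yield request

    def parse_business(self, response):
        elapsed = time.time() - response.meta['scheduled_at']
        # if this line shows up at all, the request was followed -- just late
        self.logger.info('business page arrived after %.1f seconds', elapsed)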

Related

crawling the stack overflow and getting url of any specified question

import requests
from bs4 import BeautifulSoup

name = []
code = []
i = 1
search1 = str(input('enter the word to search'))
search1 = "+".join(search1.split())
address = "https://stackoverflow.com/search?q=" + search1
print(address)
res = requests.get(address)
soup = BeautifulSoup(res.text, "html.parser")
links = soup.find_all('a', class_='question-hyperlink')
for link in links:
    requests.get(address)
    print('link:' + link['href'])
    soup = BeautifulSoup(res.text, "html.parser")
This is my code. I want to extract the URL of any specified question on Stack Overflow, based on the given class, but the output I am getting is very random.
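No answer was recorded here, but one plausible cause (an assumption on my part, not a confirmed fix) is that the href attributes on the search results are relative paths, and the stray requests.get(address) inside the loop re-fetches the search page without ever using the response. A minimal sketch that joins each href onto the site root with urllib.parse.urljoin:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

search = input('enter the word to search: ')
address = "https://stackoverflow.com/search?q=" + "+".join(search.split())

res = requests.get(address)
soup = BeautifulSoup(res.text, "html.parser")

for link in soup.find_all('a', class_='question-hyperlink'):
    # hrefs come back relative (/questions/123/...), so anchor them to the site
    print('link: ' + urljoin("https://stackoverflow.com", link['href']))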

how to pass additional parameters to spider parse function in scrapy during web scraping

I am trying to pass additional information to the parse function, but it is giving a type error:
TypeError: parse() got an unexpected keyword argument 'body'
I am unable to resolve this issue.
"""
return [scrapy.Request(url=website.search_url.format(prod), callback=self.parse,
cb_kwargs = {"body":website.body_xpath,"product_list":website.products_list_xpath,
"names":website.products_name_xpath,"selling_price":website.selling_price_xpath,
"market_price":website.market_price_xpath}) for website in websites for prod in modified_products]
def parse(self, response):
body = response.cb_kwargs.get("body")
product_list = response.cb_kwargs.get("product_list")
name = response.cb_kwargs.get("names")
selling_price = response.cb_kwargs.get("selling_price")
market_price = response.cb_kwargs.get("market_price")
"""
I forgot to declare those names in the parse function definition; after adding them I am getting the correct result. Thanks for having a look at it.
"""
return [scrapy.Request(url=website.search_url.format(prod), callback=self.parse,
cb_kwargs = dict(body = website.body_xpath, product_list = website.products_list_xpath,
name = website.products_name_xpath, selling_price = website.selling_price_xpath,
market_price = website.market_price_xpath)) for website in websites for prod in modified_products]
def parse(self, response, body, product_list, name, selling_price, market_price):
body = response.cb_kwargs["body"]
product_list = response.cb_kwargs["product_list"]
name_ = response.cb_kwargs["name"]
selling_price_ = response.cb_kwargs["selling_price"]
market_price_ = response.cb_kwargs["market_price"]
"""

Calling HTML tags from another file in a Scrapy spider class and parsing them

I am new to Python and Scrapy, so I am not aware whether the following scenario can be achieved; please let me know if it's possible.
I know that this is how Scrapy usually works:
import scrapy
from scrapy.loader import ItemLoader
from ..items import Allitems

class newspider(scrapy.Spider):
    name = "my_code"
    start_urls = ['URL_Name']

    def parse(self, response):
        class_name = ".product-list--list-item"
        product_name_tag = "./div/div[1]/div/div[1]/div[1]/div[1]/h3/a/text()"
        selector1 = response.css(class_name)
        for items in selector1:
            loader = ItemLoader(item=Allitems(), selector=items)
            loader.add_xpath('Product_Name', product_name_tag)
            yield loader.load_item()
How can I create a file that holds all the HTML tags declared as variables, then reference those variables in a Spider class and parse the corresponding tags via selectors?
For example -
Sample.txt :

class_name = ".product-list--list-item"
product_name_tag = "./div/div[1]/div/div[1]/div[1]/div[1]/h3/a/text()"

Spider file :

import scrapy
from scrapy.loader import ItemLoader
from ..items import Allitems

class newspider(scrapy.Spider):
    name = "my_code"
    start_urls = ['URL_Name']

    def parse(self, response):
        # *** Call Sample.txt ***
        selector1 = response.css(class_name)
        for items in selector1:
            loader = ItemLoader(item=Allitems(), selector=items)
            loader.add_xpath('Product_Name', product_name_tag)
            yield loader.load_item()
Across different websites the HTML tags might change. Therefore, I want to keep the HTML tag declarations in a separate file, and the parsing and scraping of each item in another file.
I would recommend creating a sample.py (instead of your Sample.txt) so you can simply import the variables. You could still do it with a .txt file, but then you would have to load it as a text file and parse through it yourself, whereas a sample.py allows importing directly:
import scrapy
from scrapy.loader import ItemLoader
from ..items import Allitems
from sample import class_name, product_name_tag

class newspider(scrapy.Spider):
    name = "my_code"
    start_urls = ['URL_Name']

    def parse(self, response):
        selector1 = response.css(class_name)
        for items in selector1:
            loader = ItemLoader(item=Allitems(), selector=items)
            loader.add_xpath('Product_Name', product_name_tag)
            yield loader.load_item()
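For completeness, sample.py would then hold nothing but the two variables from the Sample.txt example above:

# sample.py -- selector definitions kept separate from the spider logic
class_name = ".product-list--list-item"
product_name_tag = "./div/div[1]/div/div[1]/div[1]/div[1]/h3/a/text()"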

Odoo 10 Print multiple reports with one click

Is there any way to print more than one report with one click in Odoo 10? For example, I have one report template and several employees; each employee should get their own report from the same template, and I want to print all the reports with one button.
I created the template and the Python file, but could not print one report per employee. Can you help me, please?
You have to extend the QWeb report, like this:
import ast

from odoo import api, models
from odoo.report import report_sxw

class orders_print(report_sxw.rml_parse):
    def __init__(self, cr, uid, name, context):
        super(orders_print, self).__init__(cr, uid, name, context=context)

class SaleOrderReport(models.AbstractModel):
    _name = 'report.sale.report_saleorder'
    _inherit = 'report.abstract_report'
    _template = 'sale.report_saleorder'
    _wrapped_report_class = orders_print

    @api.multi
    def render_html(self, docids, data=None, context=None):
        report_obj = self.env['report']
        report = report_obj._get_report_from_name('sale.report_saleorder')
        if data['data']:
            # the ids of all records to print arrive in the options payload
            d = ast.literal_eval(data['data']['options'])
            docs = self.env[report.model].browse(d['ids'])
            docargs = {
                'doc_ids': self._ids,
                'doc_model': 'sale.order',
                'docs': docs,
            }
            return report_obj.render('sale.report_saleorder', docargs)
        else:
            return super(SaleOrderReport, self).render_html(docids, data=data)
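As a hedged aside beyond the answer above: in Odoo 10 a button method can hand a whole recordset to the report model, which renders one document with a section per record. A sketch under that assumption; the report name my_module.report_employee and the method name are made up for illustration:

from odoo import api, models

class HrEmployee(models.Model):
    _inherit = 'hr.employee'

    @api.multi
    def action_print_reports(self):
        # one click: render the template once for every record in self;
        # the QWeb template iterates over docs, one section per employee
        return self.env['report'].get_action(self, 'my_module.report_employee')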

Django-Admin Exception Value: 'DeclarativeFieldsMetaclass' object is not iterable

I have this form in forms.py:

class EmailForm(forms.Form):
    recipient = forms.CharField(max_length=14, min_length=12,
                                widget=forms.TextInput(attrs=require))
    message = forms.CharField(max_length=140, min_length=1,
                              widget=forms.Textarea(attrs={'cols': 30, 'rows': 5}))
and my site URLconf is:

admin.autodiscover()

urlpatterns = patterns('',
    (r'^admin/(.*)', include(admin.site.urls)),
)
Now I want it to be shown in the admin interface. Here is what I tried so far.
First attempt:

from myapps.forms import EmailForm

class EmailAdmin(admin.ModelAdmin):
    form = EmailForm

This did not work. Exception Value:
'DeclarativeFieldsMetaclass' object is not iterable
Second attempt:
I then followed http://docs.djangoproject.com/en/dev/ref/contrib/admin/#django.contri... but it did not help either:
class EmailAdmin(admin.ModelAdmin):
    def my_view(self, request):
        return admin_my_view(request, self)

    def get_urls(self):
        urls = super(EmailAdmin, self).get_urls()
        my_urls = patterns('',
            (r'^my_view/$', self.admin_site.admin_view(self.my_view)))
        return my_urls + urls

def admin_my_view(request, model_admin):
    opts = model_admin.model._meta
    admin_site = model_admin.admin_site
    has_perm = request.user.has_perm(opts.app_label
                                     + '.' + opts.get_change_permission())
    context = {'admin_site': admin_site.name,
               'title': "My Custom View",
               'opts': opts,
               'root_path': '/%s' % admin_site.root_path,
               'app_label': opts.app_label,
               'has_change_permission': has_perm}
    template = 'admin/demo_app/admin_my_view.html'
    return render_to_response(template, context,
                              context_instance=RequestContext(request))

admin.site.register(EmailForm, EmailAdmin)
When I run the server, type localhost:8000/admin into the browser and hit enter, I get:
Exception Value:
'DeclarativeFieldsMetaclass' object is not iterable
The second time, right after the first, it shows me the admin page, but I can't see my EmailAdmin in the admin interface.
Please help me or suggest a link. Thanks.
(This is my attempt at reformatting your model code):

class EmailForm(forms.Form):
    recipient = forms.CharField(max_length=14, min_length=12,
                                widget=forms.TextInput(attrs=require))
    message = forms.CharField(max_length=140, min_length=1,
                              widget=forms.Textarea(attrs={'cols': 30, 'rows': 5}))

I would put my money on the bit that says "attrs=require" -- if that's not a typo.
What you want instead is something like this:

recipient = forms.CharField(max_length=14, min_length=12,
                            widget=forms.TextInput(), required=True)
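One further observation beyond the original answer: admin.site.register() expects a model class, and the question's last line registers the EmailForm itself; trying to iterate over a form class is precisely what raises 'DeclarativeFieldsMetaclass' object is not iterable. A sketch of the usual shape, with a hypothetical Email model:

from django.contrib import admin
from myapps.forms import EmailForm
from myapps.models import Email   # hypothetical model -- the admin needs a model

class EmailAdmin(admin.ModelAdmin):
    form = EmailForm              # the form only customizes the model's admin

admin.site.register(Email, EmailAdmin)   # register the model, not the form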
