I am trying to parse Twitter. The desired outputs are the URL of the tweet, the date of the tweet, the sender, and the tweet text itself. There are no errors, but the result is empty, and I could not find the problem. The code is below; if you could help me out it would be great, since I will be using the data in my thesis.
from bs4 import BeautifulSoup
import urllib.request
import openpyxl

wb = openpyxl.load_workbook('dene1.xlsx')
sheet = wb.get_sheet_by_name('Sayfa1')

headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
url = 'https://twitter.com/search?q=TURKCELL%20lang%3Atr%20since%3A2012-01-01%20until%3A2012-01-09&src=typd&lang=tr'
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
soup = BeautifulSoup(respData, 'html.parser')
gdata = soup.find_all("div", {"class": "content"})

for item in gdata:
    try:
        items2 = item.find('a', {'class': 'tweet-timestamp js-permalink js-nav js-tooltip'})
        items21 = items2.get('href')
        items22 = items2.get('title')
    except:
        pass
    try:
        items1 = item.find('span', {'class': 'username js-action-profile-name'}).text
    except:
        pass
    try:
        items3 = item.find('p', {'class': 'TweetTextSize js-tweet-text tweet-text'}).text
        sheet1 = sheet.append([items21, items22, items1, items3])
    except:
        pass

wb.save('dene1.xlsx')
regards
Every line inside your try blocks raises an exception at least once; you just never see the errors, because your bare except clauses silently catch every exception:
import urllib.request
from bs4 import BeautifulSoup

headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}
url = 'https://twitter.com/search?q=TURKCELL%20lang%3Atr%20since%3A2012-01-01%20until%3A2012-01-09&src=typd&lang=tr'
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
soup = BeautifulSoup(respData, 'html.parser')
gdata = soup.find_all("div", {"class": "content"})

for item in gdata:
    items2 = item.find('a', {'class': 'tweet-timestamp js-permalink js-nav js-tooltip'}, href=True)
    if items2:
        items21 = items2.get('href')
        items22 = items2.get('title')
        print(items21)
        print(items22)
    items1 = item.find('span', {'class': 'username js-action-profile-name'})
    if items1:
        print(items1.text)
    items3 = item.find('p', {'class': 'TweetTextSize js-tweet-text tweet-text'})
    if items3:
        print(items3.text)
Now you can see lots of output.
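If you still want the rows in Excel rather than printed, the same None checks can feed openpyxl. A minimal sketch of that variant, reusing gdata from the code above and the dene1.xlsx workbook from the question:

import openpyxl

wb = openpyxl.load_workbook('dene1.xlsx')
sheet = wb['Sayfa1']  # modern indexing; get_sheet_by_name() is deprecated

for item in gdata:
    link = item.find('a', {'class': 'tweet-timestamp js-permalink js-nav js-tooltip'}, href=True)
    user = item.find('span', {'class': 'username js-action-profile-name'})
    text = item.find('p', {'class': 'TweetTextSize js-tweet-text tweet-text'})
    if link and user and text:
        # append() writes the row in place and returns None, so don't assign its result
        sheet.append([link.get('href'), link.get('title'), user.text, text.text])

wb.save('dene1.xlsx')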
import requests
from bs4 import BeautifulSoup

name = []
code = []
i = 1
search1 = str(input('enter the word to search'))
search1 = "+".join(search1.split())
address = "https://stackoverflow.com/search?q=" + search1
print(address)
res = requests.get(address)
soup = BeautifulSoup(res.text, "html.parser")
links = soup.find_all('a', class_='question-hyperlink')
for link in links:
    requests.get(address)
    print('link:' + link['href'])
    soup = BeautifulSoup(res.text, "html.parser")
This is my code. I want to extract the URL of a question I search for on Stack Overflow, but the output I am getting is very random.
I want to extract the URL of any specified question, based on the given class.
I am so confused by all the posts about chaining URL requests that I can't fix this by myself.
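For reference, a minimal sketch of one way to clean up the loop, assuming the question-hyperlink class from the code above still applies: the hrefs Stack Overflow returns are site-relative, and the extra requests.get(address) inside the loop re-fetches the same search page without ever using the response.

import requests
from bs4 import BeautifulSoup

search1 = "+".join(input('enter the word to search: ').split())
address = "https://stackoverflow.com/search?q=" + search1

res = requests.get(address)
soup = BeautifulSoup(res.text, "html.parser")

for link in soup.find_all('a', class_='question-hyperlink'):
    # hrefs are site-relative, e.g. /questions/12345/..., so prefix the host
    print('link: https://stackoverflow.com' + link['href'])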
I am trying to take some info from a web page and then follow an "a href" link on it, where the further information I want is stored.
from bs4 import BeautifulSoup
import requests
from csv import reader, writer, DictWriter, DictReader

source = requests.get("http://www.bda-ieo.it/test/Group.aspx?Lan=Ita")
soup = BeautifulSoup(source.text, "html.parser")

titolo_sezione = ""
table_row = ""

with open("genere.txt", "w", newline="") as txt_file:
    headers = ["GRUPPO MERCEOLOGICO", "CODICE MERCEOLOGICO", "ALIMENTO"]
    csv_writer = DictWriter(txt_file, fieldnames=headers, delimiter=';')
    csv_writer.writeheader()
    for table_row in soup.find("table", id="tblResult").find_all("tr"):
        className = ""
        if table_row.get("class"):
            className = table_row.get("class").pop()
        if className == "testobold":
            titolo_sezione = table_row.text
        if className == "testonormale":
            for cds in table_row.find_all("td"):
                url = cds.get("a")
                urls = requests.get("http://www.bda-ieo.it/test/Groupfood.aspx?Lan=Ita + url")
                dage = BeautifulSoup(urls.text, "html.parser")
                alimenti = ""
                for alimenti in dage:
                    id_alimento, destra = alimenti.find_all("td")
                    codice = id_alimento.text
                    nome = destra.text
                    href = destra.a.get("href")
                    print(f'{titolo_sezione}; {id_alimento.text}; {nome.text}')
The variable urls doesn't open any further page. Can somebody help me figure this out?
I am stuck on that.
Thank you
Mass
You need to re-work some of the logic in there, as well as read up a bit about string formatting. I made notes of where I made changes, and I'm not sure what exactly you are looking for as an output, but this may get you going.
from bs4 import BeautifulSoup
import requests
from csv import reader, writer, DictWriter, DictReader

source = requests.get("http://www.bda-ieo.it/test/Group.aspx?Lan=Ita")
soup = BeautifulSoup(source.text, "html.parser")

titolo_sezione = ""
table_row = ""

with open("c:/test/genere.txt", "w", newline="") as txt_file:
    headers = ["GRUPPO MERCEOLOGICO", "CODICE MERCEOLOGICO", "ALIMENTO"]
    csv_writer = DictWriter(txt_file, fieldnames=headers, delimiter=';')
    csv_writer.writeheader()
    for table_row in soup.find("table", id="tblResult").find_all("tr"):
        className = ""
        if table_row.get("class"):
            className = table_row.get("class").pop()
        if className == "testobold":
            titolo_sezione = table_row.text
        if className == "testonormale":
            for cds in table_row.find_all("a", href=True):  # <-- the hrefs are in the <a> tags within the <td> tags, so find <a> tags that have an href
                url = cds['href']  # <-- get the href
                urls = requests.get("http://www.bda-ieo.it/test/%s" % url)  # <-- substitute that stored string into the new url you'll be requesting
                dage = BeautifulSoup(urls.text, "html.parser")  # <-- create a BeautifulSoup object from that response
                dageTbl = dage.find("table", id="tblResult")  # <-- find the table in this html now
                if dageTbl:  # <-- if that table exists
                    for alimenti in dageTbl.find_all('tr', {'class': 'testonormale'}):  # <-- find the rows with the specific class
                        id_alimento, destra = alimenti.find_all("td")
                        codice = id_alimento.text
                        nome = destra.text.strip()  # <-- added strip() to remove whitespace
                        href = destra.a.get("href")
                        print(f'{titolo_sezione}; {codice}; {nome}')  # <-- fixed string formatting here too
Output:
PATATE; 381; PATATE
PATATE; 50399; PATATE DOLCI
PATATE; 380; PATATE NOVELLE
PATATE; 3002; PATATE, FECOLA
PATATE; 100219; PATATE, POLVERE ISTANTANEA
PATATE; 382; PATATINE IN SACCHETTO
PATATE; 18; TAPIOCA
VEGETALI; 303; ASPARAGI DI BOSCO
VEGETALI; 304; ASPARAGI DI CAMPO
VEGETALI; 305; ASPARAGI DI SERRA
VEGETALI; 700484; ASPARAGI IN SCATOLA
VEGETALI; 8035; GERMOGLI DI ERBA MEDICA
...
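The key fix is the URL construction: in the original code the variable name url sat inside the string literal, so every request went to the same nonsense address. A minimal illustration (the href value here is hypothetical):

url = "Groupfood.aspx?Lan=Ita&gr=01"  # hypothetical href value
bad = "http://www.bda-ieo.it/test/Groupfood.aspx?Lan=Ita + url"  # ' + url' is literal text, never substituted
good = "http://www.bda-ieo.it/test/%s" % url  # %s substitutes the variable
# or equivalently: f"http://www.bda-ieo.it/test/{url}"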
I'm iterating over a list of ingredients (strings) in a for loop, making an HTTP POST request for each one to obtain its nutritional info.
The following works.
data = '{"query": "black olives"}'
r = requests.post(url, headers = headers, data = data)
body = json.loads(r.text)
But this:
for ingredient in ingredients:
    data = '{"query": ' + ingredient + '}'
    r = requests.post(url, headers = headers, data = data)
    body = json.loads(r.text)
gives the error:
{'message': 'Unexpected token r in JSON at position 10'}
How do I fix it?
Edit: It works now.
The hand-built string is not valid JSON, because the ingredient value is unquoted (the server sees something like {"query": red peppers}, and the "r" at position 10 is exactly what the error complains about). It is safer to construct the JSON from a dict.
Try this:
import json

for ingredient in ingredients:
    data = {"query": ingredient}
    # Two options: let requests do the json conversion of the dict
    r = requests.post(url, headers = headers, json=data)
    # or do it yourself
    # r = requests.post(url, headers = headers, data=json.dumps(data))
    body = json.loads(r.text)
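As a side note, requests can decode the JSON response itself, so the final line can be shortened:

body = r.json()  # equivalent to json.loads(r.text)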
Following is the code. Basically, I am scraping movie info from IMDb. But somehow the Request doesn't scrape the URL stored in the variable addr: the print statement I put into parse_item2 simply never shows up.
This is driving me crazy. I have spent hours on it. Could anyone with some experience help? Thank you so much.
# code for the spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request, Response
from beta.items import BetaItem
import urllib2

class AlphaSpider(CrawlSpider):
    name = 'alpha'
    allowed_domains = ['amazon.com', 'imdb.com']
    start_urls = ['http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us&title_type=feature&year=2005,2005']
    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//td/a',), allow=('/title/')), callback='parse_item1'),
             )

    def parse_item1(self, response):
        sel = Selector(response)
        item = BetaItem()
        idb = sel.xpath('//link[@rel="canonical"]/@href').extract()
        idb = idb[0].split('/')[-2]
        item['idb'] = idb
        title = sel.xpath('//h1[@class="header"]/span[@class="itemprop"]/text()').extract()
        item['title'] = title
        addr = 'http://www.imdb.com/title/' + idb + '/business'
        request = Request(addr, callback=self.parse_item2)
        request.meta['item'] = item
        return request

    def parse_item2(self, response):
        print 'I am here'
        item = response.meta['item']
        sel = Selector(response)
        # BLA BLA BLA
        return item
The reason for the problem is indeed as Blender said in his comment above: it simply takes quite some time to crawl the pages for some of these requests, so the callback runs much later than I expected.
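For anyone debugging the same symptom, one way to make the callback's activity easier to spot is the spider's built-in log() method from the same scrapy.contrib-era API; a sketch, not the asker's code:

    def parse_item2(self, response):
        # self.log() routes through scrapy's logger, so the message appears
        # in the crawl log with a timestamp even when print output is buried
        self.log('parse_item2 reached for %s' % response.url)
        item = response.meta['item']
        return item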
Using the following code I received an error:
TypeError: POST data should be bytes or an iterable of bytes. It cannot be str
A second concern: I am not sure if I specified my user agent correctly. Here is my user agent in full: Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4. I gave it my best shot when defining the user agent in the script.
import urllib.parse
import urllib.request

url = 'http://getliberty.org/contact-us/'
user_agent = 'Mozilla/5.0 (compatible; Chrome/22.0.1229.94; Windows NT)'
values = {'Your Name': 'Horatio',
          'Your Email': '6765Minus4181@gmail.com',
          'Subject': 'Hello',
          'Your Message': 'Cheers'}
headers = {'User-Agent': user_agent}
data = urllib.parse.urlencode(values)
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()
I am aware of this similar question, TypeError: POST data should be bytes or an iterable of bytes. It cannot be str, but I am too new for that answer to be much help.
data = urllib.parse.urlencode(values)
type(data)  # this returns <class 'str'>: it's a string
The urllib docs say for urllib.request.Request(url, data ...):
The urllib.parse.urlencode() function takes a mapping or sequence of 2-tuples and returns a string in this format. It should be encoded to bytes before being used as the data parameter. etc etc
(emphasis mine)
So you have a string that looks right; what you need is that string encoded into bytes. And you get to choose the encoding.
binary_data = data.encode(encoding)
In the line above, encoding can be 'utf-8', 'ascii', or a number of other things. Pick whichever one the server expects.
So you end up with something that looks like:
data = urllib.parse.urlencode(values)
binary_data = data.encode(encoding)
req = urllib.request.Request(url, binary_data)
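Note that the original script builds a headers dict but never passes it to Request, so the custom user agent is never sent. Assuming you still want it, headers goes in as the third argument:

req = urllib.request.Request(url, binary_data, headers)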
You can try the requests module as an alternative solution:
import json
import requests

url = 'http://getliberty.org/contact-us/'
user_agent = 'Mozilla/5.0 (compatible; Chrome/22.0.1229.94; Windows NT)'
values = {
    'Your Name': 'Horatio',
    'Your Email': '6765Minus4181@gmail.com',
    'Subject': 'Hello',
    'Your Message': 'Cheers'
}
headers = {'User-Agent': user_agent, 'Content-Type': 'application/json'}
data = json.dumps(values)

request = requests.post(url, data=data, headers=headers)
response = request.json()
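One caveat with this alternative (an assumption on my part, since the target looks like an ordinary contact form): request.json() only succeeds if the endpoint actually returns JSON, and many forms expect form-encoded fields rather than a JSON body. A form-encoded sketch:

response = requests.post(url, data=values, headers={'User-Agent': user_agent})  # requests sets the Content-Type for you
print(response.status_code)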