import requests
from bs4 import BeautifulSoup

name = []
code = []
i = 1

search1 = str(input('enter the word to search: '))
search1 = "+".join(search1.split())
address = "https://stackoverflow.com/search?q=" + search1
print(address)

res = requests.get(address)
soup = BeautifulSoup(res.text, "html.parser")
links = soup.find_all('a', class_='question-hyperlink')
for link in links:
    requests.get(address)
    print('link: ' + link['href'])
    soup = BeautifulSoup(res.text, "html.parser")
This is my code. I want to extract the URLs of the Stack Overflow questions matching my search term, but the output I am getting is very random.
I want to extract the URL of any matching question, based on the given class.
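For what it's worth, the hrefs on the search page are relative (they start with /questions/...), so they need to be joined with the site root; here is a minimal sketch of the same idea that fetches and parses the page once and prints absolute URLs (it assumes the question-hyperlink class is unchanged):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

term = input('enter the word to search: ')
address = "https://stackoverflow.com/search?q=" + "+".join(term.split())

res = requests.get(address)                    # fetch the search results once
soup = BeautifulSoup(res.text, "html.parser")  # parse them once

for link in soup.find_all('a', class_='question-hyperlink'):
    # the href is relative, so join it with the site root to get a full URL
    print(urljoin("https://stackoverflow.com/", link['href']))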
I am using this script to get data about COVID-19 from PubMed:
from Bio import Entrez

def search(query):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='20',
                            retmode='xml',
                            term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

if __name__ == '__main__':
    results = search('covid-19')
    id_list = results['IdList']
    papers = fetch_details(id_list)
    for i, paper in enumerate(papers['PubmedArticle']):
        print("{}) {}".format(i + 1, paper['MedlineCitation']['Article']['ArticleTitle']))
I get results in the console, but what I want is to automatically download the articles as files (XML or plain text). Any suggestions on how to do that? I googled it but found nothing.
You can add this code at the end to save the results to a JSON file:
# write to file
import json

with open('file.json', 'w') as json_file:
    json.dump(papers, json_file)
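If you want the raw PubMed XML rather than the parsed Python structure, one option is to call Entrez.efetch again and write the response text straight to disk. This is only a sketch, assuming the same id_list as above and a hypothetical output path:

from Bio import Entrez

def save_raw_xml(id_list, path='pubmed_articles.xml'):
    # fetch the same records, but keep the response as raw XML text instead of parsing it
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed', id=','.join(id_list), retmode='xml')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(handle.read())
    handle.close()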
I have tried to apply the advice in this question, but I still can't get the code to work. I am trying to pass two variables:
How to pass variables in python flask to mysqldb?
cur = mysql.connection.cursor()
for row in jdict:
    headline = row['title']
    url = row['url']
    cur.execute("INSERT INTO headlinetitles (Title, Url) VALUES ('%s','%s');", str(headline), str(url))
    mysql.connection.commit()
I get the error TypeError: execute() takes from 2 to 3 positional arguments but 4 were given
I found the solution. Maybe this will help someone: execute() expects the query string and then a single tuple of parameters, so the two values have to be passed together rather than as separate arguments, and the %s placeholders should not be quoted.
cur = mysql.connection.cursor()
for row in jdict:
    headline = row['title']
    url = row['url']
    # parameters go in one tuple; the driver handles quoting, so no quotes around %s
    cur.execute("INSERT INTO headlinetitles (Title, Url) VALUES (%s, %s);", (str(headline), str(url)))
    mysql.connection.commit()
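If jdict is large, the same insert can be done in one call with executemany. This is just a sketch, assuming jdict is a list of dicts with 'title' and 'url' keys as above:

cur = mysql.connection.cursor()
rows = [(row['title'], row['url']) for row in jdict]
# one parameterized statement, executed once per (title, url) pair
cur.executemany("INSERT INTO headlinetitles (Title, Url) VALUES (%s, %s);", rows)
mysql.connection.commit()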
I am very confused by all the posts about chaining URL requests, and I can't fix this by myself.
I am trying to take some info from a web page and then open the "a href" link in each row, where the further information I want is stored.
from bs4 import BeautifulSoup
import requests
from csv import reader, writer, DictWriter, DictReader

source = requests.get("http://www.bda-ieo.it/test/Group.aspx?Lan=Ita")
soup = BeautifulSoup(source.text, "html.parser")

titolo_sezione = ""
table_row = ""

with open("genere.txt", "w", newline="") as txt_file:
    headers = ["GRUPPO MERCEOLOGICO", "CODICE MERCEOLOGICO", "ALIMENTO"]
    csv_writer = DictWriter(txt_file, fieldnames=headers, delimiter=';')
    csv_writer.writeheader()
    for table_row in soup.find("table", id="tblResult").find_all("tr"):
        className = ""
        if table_row.get("class"):
            className = table_row.get("class").pop()
        if className == "testobold":
            titolo_sezione = table_row.text
        if className == "testonormale":
            for cds in table_row.find_all("td"):
                url = cds.get("a")
                urls = requests.get("http://www.bda-ieo.it/test/Groupfood.aspx?Lan=Ita + url")
                dage = BeautifulSoup(urls.text, "html.parser")
                alimenti = ""
                for alimenti in dage:
                    id_alimento, destra = alimenti.find_all("td")
                    codice = id_alimento.text
                    nome = destra.text
                    href = destra.a.get("href")
                    print(f'{titolo_sezione}; {id_alimento.text}; {nome.text}')
The variable urls doesn't open any further page. Can somebody help me clear this up? I am stuck on it.
Thank you,
Mass
You need to re-work some of the logic in there, as well as read up a bit about string formatting. I made notes of where I made changes, and I'm not sure what exactly you are looking for as an output, but this may get you going.
from bs4 import BeautifulSoup
import requests
from csv import reader, writer, DictWriter, DictReader

source = requests.get("http://www.bda-ieo.it/test/Group.aspx?Lan=Ita")
soup = BeautifulSoup(source.text, "html.parser")

titolo_sezione = ""
table_row = ""

with open("c:/test/genere.txt", "w", newline="") as txt_file:
    headers = ["GRUPPO MERCEOLOGICO", "CODICE MERCEOLOGICO", "ALIMENTO"]
    csv_writer = DictWriter(txt_file, fieldnames=headers, delimiter=';')
    csv_writer.writeheader()
    for table_row in soup.find("table", id="tblResult").find_all("tr"):
        className = ""
        if table_row.get("class"):
            className = table_row.get("class").pop()
        if className == "testobold":
            titolo_sezione = table_row.text
        if className == "testonormale":
            for cds in table_row.find_all("a", href=True):  # <-- the hrefs live in the <a> tags inside the <td> tags, so find <a> tags that have an href
                url = cds['href']  # <--- get the href
                urls = requests.get("http://www.bda-ieo.it/test/%s" % url)  # <--- insert that string into the new url you'll be requesting
                dage = BeautifulSoup(urls.text, "html.parser")  # <-- create a BeautifulSoup object from that response
                dageTbl = dage.find("table", id="tblResult")  # <--- find the table in this html now
                if dageTbl:  # <--- if that table is present
                    for alimenti in dageTbl.find_all('tr', {'class': 'testonormale'}):  # <--- find the rows with the specific class
                        id_alimento, destra = alimenti.find_all("td")
                        codice = id_alimento.text
                        nome = destra.text.strip()  # <--- added strip() to remove whitespace
                        href = destra.a.get("href")
                        print(f'{titolo_sezione}; {codice}; {nome}')  # <--- fixed string formatting here too
Output:
PATATE; 381; PATATE
PATATE; 50399; PATATE DOLCI
PATATE; 380; PATATE NOVELLE
PATATE; 3002; PATATE, FECOLA
PATATE; 100219; PATATE, POLVERE ISTANTANEA
PATATE; 382; PATATINE IN SACCHETTO
PATATE; 18; TAPIOCA
VEGETALI; 303; ASPARAGI DI BOSCO
VEGETALI; 304; ASPARAGI DI CAMPO
VEGETALI; 305; ASPARAGI DI SERRA
VEGETALI; 700484; ASPARAGI IN SCATOLA
VEGETALI; 8035; GERMOGLI DI ERBA MEDICA
...
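If you also want these rows written into genere.txt through the DictWriter that is already open (rather than only printed), the inner loop could do something like the following. This is only a sketch, reusing the same headers defined above:

for alimenti in dageTbl.find_all('tr', {'class': 'testonormale'}):
    id_alimento, destra = alimenti.find_all("td")
    codice = id_alimento.text
    nome = destra.text.strip()
    # write one CSV row per food item, keyed by the header names defined earlier
    csv_writer.writerow({
        "GRUPPO MERCEOLOGICO": titolo_sezione.strip(),
        "CODICE MERCEOLOGICO": codice,
        "ALIMENTO": nome,
    })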
Following is the code. Basically, I am scraping movie info from IMDb.com, but somehow the Request doesn't scrape the URL stored in the variable addr: the print I put into parse_item2 simply does not show up.
This drives me crazy. I have spent hours on it. Could anyone with some experience help? Thank you so much.
# code for the spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request, Response
from beta.items import BetaItem
import urllib2

class AlphaSpider(CrawlSpider):
    name = 'alpha'
    allowed_domains = ['amazon.com', 'imdb.com']
    start_urls = ['http://www.imdb.com/search/title?at=0&sort=boxoffice_gross_us&title_type=feature&year=2005,2005']
    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//td/a',), allow=('/title/')), callback='parse_item1'),
             )

    def parse_item1(self, response):
        sel = Selector(response)
        item = BetaItem()
        idb = sel.xpath('//link[@rel="canonical"]/@href').extract()
        idb = idb[0].split('/')[-2]
        item['idb'] = idb
        title = sel.xpath('//h1[@class="header"]/span[@class="itemprop"]/text()').extract()
        item['title'] = title
        addr = 'http://www.imdb.com/title/' + idb + '/business'
        request = Request(addr, callback=self.parse_item2)
        request.meta['item'] = item
        return request

    def parse_item2(self, response):
        print 'I am here'
        item = response.meta['item']
        sel = Selector(response)
        # BLA BLA BLA
        return item
The reason for the problem is indeed what Blender said in his comment above: it simply takes quite some time to crawl some of these requests, so the callback fires much later than expected.
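As a side note, when a callback seems silent it can help to log through Scrapy instead of a bare print, so the message is timestamped in the crawl log and late responses are easy to spot. A small sketch of parse_item2 written that way (same logic otherwise, to be placed inside the spider class):

def parse_item2(self, response):
    # goes to Scrapy's log with a timestamp instead of stdout
    self.log('parse_item2 reached for %s' % response.url)
    item = response.meta['item']
    sel = Selector(response)
    # ... fill in the remaining fields here ...
    return item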
I am using python-twitter API v1.0, which says that it works with v1.1 of the Twitter API.
This is my code:
import json
import urllib2
import urllib
import csv

file1 = open("somez.csv", "wb")
fname = "tw1.txt"
file = open(fname, "r")
ins = open("tw1.txt", "r")
array = []
for line in ins:
    array.append(line)
    s = 'https://api.twitter.com/1.1/statuses/show/' + line[:-1] + '.json'
    print s
    try:
        data = urllib2.urlopen(s)
    except:
        print "Not Found"
        continue
    print data
    json_format = json.load(data)
    js = json_format
    print line[:-1]
    print js[('user')]['id']
    print js[('user')]['created_at']
    print js['retweet_count']
    print js['text']
    # js = js.decode('utf-8')
    one = line[:-1].encode('utf-8')
    thr = js['user']['id']
    two = js['user']['created_at'].encode('utf-8')
    four = js['retweet_count']
    five = js['text'].encode('utf-8')
    rw = [one, two, thr, four, five]
    spamWriter = csv.writer(file1, delimiter=',')
    spamWriter.writerow(rw)
file1.close()
I am not able to retrieve any data. It is saying "Not Found". When I open one of the URLs, I get this error:
{"errors":[{"message":"Bad Authentication data","code":215}]}
Can anyone suggest what the problem might be?
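Error code 215 ("Bad Authentication data") means Twitter's API v1.1 rejects unauthenticated requests, so plain urllib2 calls without OAuth credentials will not work. A hedged sketch of fetching the same fields through the python-twitter library with OAuth, where the key/secret placeholders and the tweet_id variable are assumptions you would fill in from your own app registration and from tw1.txt:

import twitter

# credentials come from a Twitter app you register; these are placeholders
api = twitter.Api(consumer_key='CONSUMER_KEY',
                  consumer_secret='CONSUMER_SECRET',
                  access_token_key='ACCESS_TOKEN',
                  access_token_secret='ACCESS_TOKEN_SECRET')

# tweet_id: one ID read from tw1.txt, as in the loop above
status = api.GetStatus(tweet_id)
print(status.user.id)
print(status.user.created_at)
print(status.retweet_count)
print(status.text)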