How to download a _full_ RefSeq record using Efetch? - biopython

I have a problem downloading a full record from Nucleotide db.
I use:
from Bio import Entrez
from Bio import SeqIO
with Entrez.efetch(db="nuccore", rettype="gb", retmode="full", id="NC_007384") as handle:
    seq_record = SeqIO.read(handle, "gb")
print(seq_record)
which gives me a short version of the GenBank file, so
seq_record.features
contains no annotated features.
In comparison, there is no problem when I do the same thing with a GenBank accession:
with Entrez.efetch(db="nuccore", rettype="gb", retmode="full", id="CP014768.1") as handle:
    seq_record = SeqIO.read(handle, "gb")
print(seq_record)
After that I can extract every annotated feature from the list seq_record.features.
Is there a way to download full RefSeq records using Efetch?

You need to either use style="withparts" or change rettype to gbwithparts to fetch all of the features. The table of valid rettype and retmode values in the NCBI EFetch documentation has more information.
>>> from Bio import Entrez
>>> from Bio import SeqIO
>>> Entrez.email = 'someone@email.com'
>>> with Entrez.efetch(db="nuccore", rettype="gb", retmode="full", id="NC_007384") as handle:
...     seq_record = SeqIO.read(handle, "gb")
...
>>> len(seq_record.features)
1
>>> with Entrez.efetch(db="nuccore", rettype="gbwithparts", retmode="full", id="NC_007384") as handle:
...     seq_record = SeqIO.read(handle, "gb")
...
>>> len(seq_record.features)
10616
>>> with Entrez.efetch(db="nuccore", rettype="gb", style="withparts", retmode="full", id="NC_007384") as handle:
...     seq_record = SeqIO.read(handle, "gb")
...
>>> len(seq_record.features)
10616
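If you fetch a large record like this more than once, it can be worth writing it to disk and parsing the local copy afterwards. A minimal sketch, where the filename NC_007384.gb is arbitrary and retmode="text" is the mode the EFetch documentation pairs with gbwithparts:
from Bio import Entrez, SeqIO

Entrez.email = 'someone@email.com'
# stream the full flat file to disk so later runs don't re-download it
with Entrez.efetch(db="nuccore", rettype="gbwithparts", retmode="text", id="NC_007384") as handle:
    with open("NC_007384.gb", "w") as out:
        out.write(handle.read())

# parse the local copy; all features are present
record = SeqIO.read("NC_007384.gb", "gb")
print(len(record.features))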

Related

Unable to take user input from a Python file through a Jenkins pipeline

import os
import sys
import json
import requests
import isodate
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

bufsize = 1024
session = requests.Session()
session.trust_env = False
path = os.getcwd()
print(path)
os.chdir('/raj')
file_name = pd.read_excel('repos_desc12_p61qr.xlsx')
file2 = open("commit_details.csv", "w+", buffering=bufsize)
file3 = open("merge_details.csv", "w+", buffering=bufsize)
hostname = "https://bsp-os.git.visteon.com"
private = "-rBpd_x15GRTmFkk_T9H"

def excel_parser(meExcel):
    dict_format = meExcel.to_dict(orient='record')
    #print(dict_format.columns.ravel())
    #dict_format = json.loads(dict_format)
    #print(dict_format)
    for repo_detail in dict_format:
        parsed_repo_path = repo_detail["REPO"]
        #print(parsed_repo_path)
        parsed_branch_name = repo_detail["BranchName"]
        #print(parsed_branch_name)
        parsed_duration = repo_detail["StartDate"]
While trying to run this and take input through the pipeline, "EOFError: EOF when reading a line" occurs. I tried exception handling but it did not work. Please help me get input into the Python file through the Jenkins pipeline.
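One common cause, shown here as a hedged sketch rather than a confirmed fix: Jenkins runs the script without an interactive stdin, so any input() call hits end-of-file immediately, which is exactly what "EOFError: EOF when reading a line" means. A workaround is to pass the value in from the Jenkinsfile instead; REPO_FILE below is a hypothetical variable name used only for illustration.
import os
import sys

# Jenkins provides no interactive stdin, so input() raises EOFError.
# Read the value from an environment variable (set in the Jenkinsfile via
# an environment {} block or withEnv), falling back to a CLI argument.
repo_file = os.environ.get("REPO_FILE")  # hypothetical variable name
if repo_file is None and len(sys.argv) > 1:
    repo_file = sys.argv[1]
if repo_file is None:
    sys.exit("No REPO_FILE env var and no command-line argument supplied")
print("Using repo file:", repo_file)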

BioPython ValueError: alphabet is None; cannot interpret sequence

So I'm getting this error message when trying to align two SeqRecord objects.
Why is this error happening?
Here is the code (reference_genome is defined earlier):
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import Align
from Bio.SeqRecord import SeqRecord
d122_fragment1 = SeqRecord(Seq('CAGTGGAAATGAAAGTGATGGGGACACAAATGAATTGCTCGCACTTATGGAAATGGGGAACTTTGATCCTTGGATTGGTGATAATTTGTAGTGCCTCAAACAACTTGTGGGTTACAGTTTATTATGGGGTTCCTGTGTGGAGAGATGCAGATACCACCCTCTTTTGTGCATCAGATGCTAAAGCACATAAGACAGAAGTGCATAATGTCTGGGCTACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAATACACCTGGGAAATGTAACAGAAGATTTTAACATGTG'), id='p122_1', annotations={"molecule_type": "DNA"})
Seq_ref_genome = SeqRecord(Seq(reference_genome), id = 'ref', annotations={"molecule_type": "DNA"})
aligner = Align.PairwiseAligner()
alignments = aligner.align(Seq_ref_genome, d122_fragment1)
alignment = alignments[0]
print(alignment)
aligner.align() works with Seq objects, not with SeqRecord objects, so pass each record's .seq attribute. (For aligning records you can look at the pairwise2 module.)
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import Align
from Bio.SeqRecord import SeqRecord
d122_fragment1 = SeqRecord(Seq('CAGTGGAAATGAAAGTGATGGGGACACAAATGAATTGCTCGCACTTATGGAAATGGGGAACTTTGATCCTTGGATTGGTGATAATTTGTAGTGCCTCAAACAACTTGTGGGTTACAGTTTATTATGGGGTTCCTGTGTGGAGAGATGCAGATACCACCCTCTTTTGTGCATCAGATGCTAAAGCACATAAGACAGAAGTGCATAATGTCTGGGCTACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAATACACCTGGGAAATGTAACAGAAGATTTTAACATGTG'), id='p122_1', annotations={"molecule_type": "DNA"})
Seq_ref_genome = SeqRecord(Seq(reference_genome), id = 'ref', annotations={"molecule_type": "DNA"})
aligner = Align.PairwiseAligner()
alignments = aligner.align(Seq_ref_genome.seq, d122_fragment1.seq)
alignment = alignments[0]
print(alignment)
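Since the answer mentions pairwise2, here is a minimal sketch of that route, using the two records defined above. Note that Bio.pairwise2 is deprecated in recent Biopython releases in favour of PairwiseAligner, and it is also happiest with plain sequences, so the .seq attributes are passed:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# globalxx: global alignment, match score 1, no mismatch or gap penalties
alignments = pairwise2.align.globalxx(Seq_ref_genome.seq, d122_fragment1.seq)
print(format_alignment(*alignments[0]))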

Is there a function to get log and pct change?

I would like to compare the log and pct change of the two symbols, but the following error appears:
KeyError: 'Adj Close'
import datetime
import pandas as pd
import numpy as np
import yfinance as yf

start = datetime.datetime(2017, 10, 1)
end = datetime.datetime.now()
symbols = ['BTC-USD', 'ETH-USD']
df = pd.DataFrame()
for i in symbols:
    data = yf.download(i, start=None, end=None, show_errors=("True"),
                       period="4y", interval="1mo")
    df[i] = data['Adj Close'].pct_change().dropna()
df['log_stuff'] = \
    np.log(df['Adj Close'].astype('float64') / df['Adj Close'].astype('float64').shift(1))
df[['pct_change', 'log_stuff', 'df']].plot();
You could try the following. Note that you can also pass a list to download(), so no loop is required.
import numpy as np
import pandas as pd
import yfinance as yf
symbols = ['BTC-USD', 'ETH-USD']
data = yf.download(symbols, period="4y", interval="1mo")
# calculate pct return
pct_data = data['Adj Close'].pct_change()
pct_data = pct_data.add_suffix('_pct')
# calculate log returns
log_data = np.log(data['Adj Close']) - np.log(data['Adj Close'].shift(1))
log_data = log_data.add_suffix('_log')
# combine returns and drop na values
combined_data = pd.concat([pct_data,log_data], axis=1).dropna()
print(combined_data)
This will yield the following output:
BTC-USD_pct ETH-USD_pct BTC-USD_log ETH-USD_log
Date
2017-12-01 0.383326 0.692483 0.324490 0.526197
2018-01-01 -0.277987 0.477813 -0.325713 0.390564
2018-02-01 0.017298 -0.235276 0.017150 -0.268240
...
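If you also want the plot the question was building toward, pandas can draw it straight from the combined frame. A minimal sketch using the combined_data frame from above:
import matplotlib.pyplot as plt

# one subplot per return series, sharing the date index
combined_data.plot(subplots=True, figsize=(10, 8), title="Monthly pct and log returns")
plt.tight_layout()
plt.show()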

AttributeError: 'list' object has no attribute 'tolist'

It's a two-part question:
import face_recognition
import os
import json

loadarr = []
encodearr = []
for i in range(0, 4):
    loadarr.append(face_recognition.load_image_file("brad" + str(i + 1) + ".jpg"))
    encodearr.append(face_recognition.face_encodings(loadarr[i])[0])
encodearr = encodearr.tolist()
# print(encodearr)
encodedDic = {"des": encodearr}
with open("sample.json", "w") as outfile:
    json.dump(encodedDic, outfile)
When I tried to use the list encodearr as the value of the key "des" (without .tolist()), it shows TypeError: Object of type ndarray is not JSON serializable. Then I added .tolist() to encodearr as shown, and it shows AttributeError: 'list' object has no attribute 'tolist'. brad1 to brad5 are the jpg files in the directory.
I did a workaround using numpy.
import face_recognition
import os
import json
import numpy as np

total_images = 4
encodearr = []
for i in range(total_images):
    load = face_recognition.load_image_file("brad" + str(i + 1) + ".jpg")
    encodearr.append(face_recognition.face_encodings(load)[0])
# each encoding is a numpy array of 128 features; reshaping gives one
# ndarray, whose .tolist() makes the data JSON serializable
reshaped_array = np.reshape(encodearr, (total_images, 128)).tolist()
encodedDic = {"des": reshaped_array}
with open("sample.json", "w") as outfile:
    json.dump(encodedDic, outfile)
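To get the encodings back later in usable form, the JSON round-trips cleanly into an ndarray. A short check under the same assumptions (four images, 128 features each):
import json
import numpy as np

# load the saved encodings and restore the array shape
with open("sample.json") as infile:
    des = np.array(json.load(infile)["des"])
print(des.shape)  # (4, 128) with the assumptions above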

NLTK Word Extraction

So I am trying to read a txt file, process it by taking out the stop words, and then output that result into a new file. However, I keep getting the following error:
TypeError: expected a string or other character buffer object
This is my code:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

f = open('tess1.txt', 'rU')
stop_words = set(stopwords.words('english'))
raw = f.read()
word_tokens = word_tokenize(raw)
text = nltk.Text(word_tokens)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
if w not in stop_words:
    filtered_sentence.append(w)
K = open("tess12.txt", "w")
K.write(filtered_sentence)
K.close()
print(filtered_sentence)
The solution is to write a string to the file:
K.write(str(filtered_sentence))
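Note that str(filtered_sentence) writes Python list syntax (brackets, quotes, commas) into the file. If plain text is wanted instead, joining the tokens is a common alternative; a sketch, not part of the original answer:
# write the kept tokens as space-separated plain text instead of list syntax
with open("tess12.txt", "w") as out:
    out.write(" ".join(filtered_sentence))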
