BioPython ValueError: alphabet is None; cannot interpret sequence - biopython

So I'm getting this error message when trying to align two SeqRecord objects.
Why is this error happening?
Here is the code (reference_genome is defined earlier in the code)
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import Align
from Bio.SeqRecord import SeqRecord
d122_fragment1 = SeqRecord(Seq('CAGTGGAAATGAAAGTGATGGGGACACAAATGAATTGCTCGCACTTATGGAAATGGGGAACTTTGATCCTTGGATTGGTGATAATTTGTAGTGCCTCAAACAACTTGTGGGTTACAGTTTATTATGGGGTTCCTGTGTGGAGAGATGCAGATACCACCCTCTTTTGTGCATCAGATGCTAAAGCACATAAGACAGAAGTGCATAATGTCTGGGCTACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAATACACCTGGGAAATGTAACAGAAGATTTTAACATGTG'), id='p122_1', annotations={"molecule_type": "DNA"})
Seq_ref_genome = SeqRecord(Seq(reference_genome), id = 'ref', annotations={"molecule_type": "DNA"})
# Create a pairwise aligner using the constructor's default settings.
aligner = Align.PairwiseAligner()
# NOTE(review): both arguments here are SeqRecord objects rather than Seq
# objects; per the reply further down, this is the call that raises the
# "alphabet is None" ValueError.
alignments = aligner.align(Seq_ref_genome, d122_fragment1)
# Take the first alignment from the result and print it.
alignment = alignments[0]
print(alignment)

This works with Seq objects, not SeqRecord objects, so pass each record's `.seq` attribute to `align()` as shown below. For aligning records, you can use the pairwise2 module.
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import Align
from Bio.SeqRecord import SeqRecord
d122_fragment1 = SeqRecord(Seq('CAGTGGAAATGAAAGTGATGGGGACACAAATGAATTGCTCGCACTTATGGAAATGGGGAACTTTGATCCTTGGATTGGTGATAATTTGTAGTGCCTCAAACAACTTGTGGGTTACAGTTTATTATGGGGTTCCTGTGTGGAGAGATGCAGATACCACCCTCTTTTGTGCATCAGATGCTAAAGCACATAAGACAGAAGTGCATAATGTCTGGGCTACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAATACACCTGGGAAATGTAACAGAAGATTTTAACATGTG'), id='p122_1', annotations={"molecule_type": "DNA"})
Seq_ref_genome = SeqRecord(Seq(reference_genome), id = 'ref', annotations={"molecule_type": "DNA"})
# Create a pairwise aligner using the constructor's default settings.
aligner = Align.PairwiseAligner()
# Pass the underlying Seq objects (each record's .seq attribute) instead of the
# SeqRecord wrappers themselves.
alignments = aligner.align(Seq_ref_genome.seq, d122_fragment1.seq)
# Take the first alignment from the result and print it.
alignment = alignments[0]
print(alignment)

Related

I am getting NotFittedError: This MultinomialNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator

my code is
import streamlit as st
import pickle
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def transform_text(text):
    """Normalise a raw message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, keep only alphanumeric tokens, drop English stop
    words and punctuation, then stem each remaining token with the
    module-level ``PorterStemmer`` instance ``ps``.

    Args:
        text: The raw message as a string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives the filters).
    """
    tokens = nltk.word_tokenize(text.lower())

    # Load the stop-word list once, as a set, instead of re-reading it for
    # every token.
    stop_words = set(stopwords.words('english'))

    stems = []
    for token in tokens:
        if not token.isalnum():
            continue
        if token in stop_words or token in string.punctuation:
            continue
        stems.append(ps.stem(token))
    return " ".join(stems)
# Load the pickled vectorizer and classifier, closing each file once it has
# been read.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that you produced yourself.
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

st.title("Email/SMS Spam Classifier")
input_sms = st.text_area("Enter the message")

if st.button('Predict'):
    # 1. preprocess
    transformed_sms = transform_text(input_sms)
    # 2. vectorize
    vector_input = tfidf.transform([transformed_sms])
    # 3. predict
    # NOTE(review): the reported NotFittedError names MultinomialNB, so it is
    # presumably raised by this call; that would mean the object stored in
    # model.pkl was pickled before fit() was called on it — confirm and
    # re-pickle the model after training.
    result = model.predict(vector_input)[0]
    # 4. Display
    if result == 1:
        st.header("Spam")
    else:
        st.header("Not Spam")

Is there a function to get log and pct change?

I would like to compare the log and percentage change of the two symbols, but the following error appears:
KeyError: 'Adj Close'
import datetime
import pandas as pd
import numpy as np
import yfinance as yf
# NOTE(review): start and end are assigned here, but download() below is
# called with start=None and end=None, so these two values are never used.
start = datetime.datetime(2017, 10, 1)
end = datetime.datetime.now()
symbols = ['BTC-USD', 'ETH-USD']
# Empty frame that receives one column per symbol in the loop below.
df = pd.DataFrame()
for i in symbols:
data = yf.download(i, start=None, end=None,show_errors=("True"),
period="4y", interval="1mo")
# Store the symbol's percentage change under a column named after the symbol.
df[i] = data['Adj Close'].pct_change().dropna()
# NOTE(review): df only ever receives the per-symbol columns assigned above, so
# df['Adj Close'] does not exist — presumably the source of the reported
# KeyError: 'Adj Close'.
df['log_stuff'] = \
np.log(df['Adj Close'].astype('float64')/df['Adj Close'].astype('float64').shift(1))
# NOTE(review): 'pct_change' and 'df' are never created as columns of df either.
df[['pct_change', 'log_stuff','df']].plot();
You could try the following. Please note that you can also pass a list of symbols to download(), so no loop is required.
import numpy as np
import pandas as pd
import yfinance as yf
symbols = ['BTC-USD', 'ETH-USD']

# A single download call covers every symbol in the list.
data = yf.download(symbols, period="4y", interval="1mo")
adj_close = data['Adj Close']

# Percentage change from one row to the next; columns are tagged with '_pct'.
pct_data = adj_close.pct_change().add_suffix('_pct')

# Log return: log(price) minus log(previous price); columns tagged with '_log'.
log_data = (np.log(adj_close) - np.log(adj_close.shift(1))).add_suffix('_log')

# Place both tables side by side and discard any row that contains NaN.
combined_data = pd.concat([pct_data, log_data], axis=1).dropna()
print(combined_data)
This will yield the following output:
BTC-USD_pct ETH-USD_pct BTC-USD_log ETH-USD_log
Date
2017-12-01 0.383326 0.692483 0.324490 0.526197
2018-01-01 -0.277987 0.477813 -0.325713 0.390564
2018-02-01 0.017298 -0.235276 0.017150 -0.268240
...

How to download _full_ RefSeq record using Efetch?

I have a problem downloading a full record from Nucleotide db.
I use:
from Bio import Entrez
from Bio import SeqIO
# Request the record from the nuccore database with rettype="gb" and parse the
# single record contained in the response.
with Entrez.efetch(db="nuccore", rettype="gb", retmode="full", id="NC_007384") as handle:
    seq_record = SeqIO.read(handle, "gb")
# The record is fully parsed at this point, so the handle is no longer needed.
print(seq_record)
which gives me a short version of gb file so the command:
seq_record.features
does not return features.
In comparison, there is no problem when I do the same thing with GenBank ID:
# Same request as for the RefSeq accession, but using a GenBank accession.
with Entrez.efetch(db="nuccore", rettype="gb", retmode="full", id="CP014768.1") as handle:
    seq_record = SeqIO.read(handle, "gb")
# The record is fully parsed at this point, so the handle is no longer needed.
print(seq_record)
After that I can extract every annotated feature from the list seq_record.features.
Is there a way to download full RefSeq records using Efetch?
You need to either use style="withparts" or change rettype to gbwithparts to fetch all of the features. This table has some information.
>>> from Bio import Entrez
>>> from Bio import SeqIO
>>> Entrez.email = 'someone#email.com'
>>> with Entrez.efetch(db="nuccore", rettype="gb", retmode="full", id="NC_007384") as handle:
... seq_record = SeqIO.read(handle, "gb")
...
>>> len(seq_record.features)
1
>>> with Entrez.efetch(db="nuccore", rettype="gbwithparts", retmode="full", id="NC_007384") as handle:
... seq_record = SeqIO.read(handle, "gb")
...
>>> len(seq_record.features)
10616
>>> with Entrez.efetch(db="nuccore", rettype="gb", style="withparts", retmode="full", id="NC_007384") as handle:
... seq_record = SeqIO.read(handle, "gb")
...
>>> len(seq_record.features)
10616

NLTK Word Extraction

So I am trying to read a txt file, process it by taking out the stop words, and then output that result into a new file. However, I keep getting the following error:
TypeError: expected a string or other character buffer object
This is my code:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Open the input text file for reading.
# NOTE(review): f is never closed anywhere in this snippet.
f=open('tess1.txt','rU')
stop_words = set(stopwords.words('english'))
# Read the whole file into one string and split it into tokens.
raw=f.read()
word_tokens = word_tokenize(raw)
# NOTE(review): text is assigned here but not used by the lines below.
text = nltk.Text(word_tokens)
# Keep only the tokens that are not English stop words.
filtered_sentence = [w for w in word_tokens if not w in stop_words]
# NOTE(review): this if/append pair looks like a leftover from a loop-based
# version; the comprehension above already applies the same filter.
if w not in stop_words:
filtered_sentence.append(w)
K = open("tess12.txt", "w")
# NOTE(review): filtered_sentence is a list, not a string — this is the call
# that the reply below changes to K.write(str(filtered_sentence)).
K.write(filtered_sentence)
K.close()
print(filtered_sentence)
The solution is to write a string to the file, so convert the list first:
K.write(str(filtered_sentence))

How to convert a text file by word2vec using python

I am a beginner in Python, natural language processing, deep learning, and neural networks. I want to run a program that converts a text file into vectors using word2vec in Python. Could someone please help me?
import math
import nltk
# Path of the text file that is passed to loadGloveModel() further down.
file = "/home/stephy/Demo/textfile.txt"
import numpy as np
def loadGloveModel(gloveFile):
    """Load whitespace-separated word vectors from a text file.

    Each non-blank line is expected to hold a word followed by its vector
    components, all separated by whitespace. Blank lines are skipped.

    Args:
        gloveFile: Path to the UTF-8 encoded vectors file.

    Returns:
        A dict mapping each word to a NumPy array of its float components.

    Raises:
        ValueError: If a component after the word cannot be parsed as a float.
    """
    model = {}
    with open(gloveFile, encoding="utf8") as f:
        # Stream the file line by line rather than reading it all into memory.
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Skip blank lines instead of failing on splitLine[0].
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print ("Done.",len(model)," words loaded!")
    return model
# Load the vectors from the path held in `file`, then print the vector stored
# under the word 'file'.
model= loadGloveModel(file)
print (model['file'])

Resources