"invalid sequence" error in seqio.write() of biopython - biopython

This question is related to bioinformatics. I did not receive any suggestions in the corresponding forums, so I am asking it here.
I need to remove non-ACTG nucleotides from a FASTA file and write the output to a new file using SeqIO from Biopython.
My code is:
import re
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

seq_list = []
for seq_record in SeqIO.parse("test.fasta", "fasta", IUPAC.ambiguous_dna):
    sequence = seq_record.seq
    sequence = sequence.tomutable()
    seq_record.seq = re.sub('[^GATC]',"",str(sequence).upper())
    seq_list.append(seq_record)

SeqIO.write(seq_list,"test_out","fasta")
Running this code gives errors:
Traceback (most recent call last):
File "remove.py", line 18, in <module>
SeqIO.write(list,"test_out","fasta")
File "/home/ghovhannisyan/Software/anaconda2/lib/python2.7/site-packages/Bio/SeqIO/__init__.py", line 481, in write
count = writer_class(fp).write_file(sequences)
File "/home/ghovhannisyan/Software/anaconda2/lib/python2.7/site-packages /Bio/SeqIO/Interfaces.py", line 209, in write_file
count = self.write_records(records)
File "/home/ghovhannisyan/Software/anaconda2/lib/python2.7/site-packages/Bio/SeqIO/Interfaces.py", line 194, in write_records
self.write_record(record)
File "/home/ghovhannisyan/Software/anaconda2/lib/python2.7/site-packages/Bio/SeqIO/FastaIO.py", line 202, in write_record
data = self._get_seq_string(record) # Catches sequence being None
File "/home/ghovhannisyan/Software/anaconda2/lib/python2.7/site-packages/Bio/SeqIO/Interfaces.py", line 100, in _get_seq_string
% record.id)
TypeError: SeqRecord (id=CALB_TCONS_00001015) has an invalid sequence.
If I change this line
seq_record.seq = re.sub('[^GATC]',"",str(sequence).upper())
to, for example, seq_record.seq = sequence + "A", everything works fine. However, re.sub('[^GATC]',"",str(sequence).upper()) should also work in theory.
Thanks

Biopython's SeqIO expects the SeqRecord object's .seq to be a Seq object (or similar), not a plain string. Try:
seq_record.seq = Seq(re.sub('[^GATC]',"",str(sequence).upper()))
For FASTA output there is no need to set the sequence's alphabet.
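Putting it together, a minimal sketch of the corrected loop (assuming the same test.fasta input and the output name test_out used in the question):

import re
from Bio import SeqIO
from Bio.Seq import Seq

seq_list = []
for seq_record in SeqIO.parse("test.fasta", "fasta"):
    # Strip everything that is not G, A, T or C and wrap the result in a Seq
    # object, because SeqRecord.seq must not be a plain string.
    cleaned = re.sub('[^GATC]', "", str(seq_record.seq).upper())
    seq_record.seq = Seq(cleaned)
    seq_list.append(seq_record)

SeqIO.write(seq_list, "test_out", "fasta")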

Related

Error "StreamModeError: Fasta files must be opened in text mode" in BioPython on a Streamlit App

Please assist: how can I resolve this error?
I am building a Streamlit app that takes uploaded FASTA files as input and reads them.
I am getting the error below when I try to read an uploaded FASTA file in my Streamlit app.
THE ERROR:
StreamModeError: Fasta files must be opened in text mode.
Traceback:
File "C:\Users\Sir Roberto\AppData\Local\Programs\Python\Python310\lib\site-packages\streamlit\scriptrunner\script_runner.py", line 475, in _run_script
exec(code, module.__dict__)
File "C:\Users\Sir Roberto\PycharmProjects\SARS_CoV_2_Mutation_Forecasting_GUI\SARS_CoV_2_Mutation_Forecasting_GUI.py", line 45, in <module>
main()
File "C:\Users\Sir Roberto\PycharmProjects\SARS_CoV_2_Mutation_Forecasting_GUI\SARS_CoV_2_Mutation_Forecasting_GUI.py", line 21, in main
protein_sample = SeqIO.read(seq_file, 'fasta')
File "C:\Users\Sir Roberto\AppData\Local\Programs\Python\Python310\lib\site-packages\Bio\SeqIO\__init__.py", line 652, in read
iterator = parse(handle, format, alphabet)
File "C:\Users\Sir Roberto\AppData\Local\Programs\Python\Python310\lib\site-packages\Bio\SeqIO\__init__.py", line 605, in parse
return iterator_generator(handle)
File "C:\Users\Sir Roberto\AppData\Local\Programs\Python\Python310\lib\site-packages\Bio\SeqIO\FastaIO.py", line 183, in __init__
super().__init__(source, mode="t", fmt="Fasta")
File "C:\Users\Sir Roberto\AppData\Local\Programs\Python\Python310\lib\site-packages\Bio\SeqIO\Interfaces.py", line 53, in __init__
raise StreamModeError(
THE CODE GIVING THE ERROR
import streamlit as st
import tensorflow as tf
import tensorflow_io as tfio
from Bio import SeqIO

def main():
    st.title('Covid-19 Mutation Forecasting App')
    menu = ['Forecast Mutation', 'About The App']
    choice = st.sidebar.selectbox('Select Activity', menu)
    if choice == 'Forecast Mutation':
        st.subheader('Mutation Forecasting Workspace')
        seq_file = st.file_uploader('Upload a Sequence File:', type=['fasta'])
        if seq_file is not None:
            protein_sample = SeqIO.read(seq_file, 'fasta')
            st.write(protein_sample)
            loaded_model = tf.keras.models.load_model("")
            next_step = st.checkbox('Forecast')
            if next_step:
                states = None
                next_char = tfio.genome.read_fastaq(protein_sample)
                result = [next_char]
                for n in range(100):
                    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
                    result.append(next_char)
                print(tf.strings.join(result)[0].numpy().decode("utf-8"))
    else:
        st.subheader('About The App')
        st.caption('Given the first few codons of a SARS-CoV-2 Spike Protein, '
                   'the App forecasts and display the complete sequence of the mutant')

if __name__ == '__main__':
    main()
The following code should get you started:
import streamlit as st
import tensorflow as tf
import tensorflow_io as tfio
from Bio import SeqIO
from io import StringIO

def main():
    st.title('Covid-19 Mutation Forecasting App')
    menu = ['Forecast Mutation', 'About The App']
    choice = st.sidebar.selectbox('Select Activity', menu)
    if choice == 'Forecast Mutation':
        st.subheader('Mutation Forecasting Workspace')
        seq_file = st.file_uploader('Upload a Sequence File:', type=['fasta'])
        if seq_file is not None:
            # To convert to a string based IO:
            stringio = StringIO(seq_file.getvalue().decode("utf-8"))
            for record in SeqIO.parse(stringio, 'fasta'):
                sequence = str(record.seq)
                st.write(f'Length of sequence: {len(sequence)}')
                # The unique characters in the FASTA file
                vocab = sorted(set(sequence))
                st.write(f"{len(vocab)} unique characters: {', '.join(vocab)}")
                tensor = tfio.genome.sequences_to_onehot(sequence)
                st.write(tensor)
    else:
        st.subheader('About The App')
        st.caption('Given the first few codons of a SARS-CoV-2 Spike Protein, '
                   'the App forecasts and display the complete sequence of the mutant')

if __name__ == '__main__':
    main()
I used tfio.genome.sequences_to_onehot instead of tfio.genome.read_fastaq, because you are reading a FASTA file, not a FASTQ file.
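An equivalent fix, as a sketch rather than a tested drop-in: Streamlit's UploadedFile behaves like a binary file object, so instead of decoding it into a StringIO you can wrap it in a text-mode wrapper (this assumes the upload contains exactly one FASTA record):

import io
from Bio import SeqIO

# seq_file is the object returned by st.file_uploader(...)
text_handle = io.TextIOWrapper(seq_file, encoding="utf-8")
protein_sample = SeqIO.read(text_handle, 'fasta')  # SeqIO.read expects a single record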

How to address a Snakemake error at the stage of DAG computation?

I'm running into an error with my Snakemake variant identification pipeline when the original DAG of jobs is built. I believe this is a memory issue; when I test with a short list of input files, the DAG is constructed without issue. However, when I try with 300+ input paired-fastq files, I receive the following error:
Building DAG of jobs...
Traceback (most recent call last):
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/__init__.py", line 633, in snakemake
keepincomplete=keep_incomplete,
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/workflow.py", line 568, in execute
dag.check_incomplete()
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/dag.py", line 281, in check_incomplete
incomplete = self.incomplete_files
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/dag.py", line 402, in incomplete_files
filterfalse(self.needrun, self.jobs),
File "/home/k/.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/dag.py", line 399, in <genexpr>
job.output
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/persistence.py", line 205, in incomplete
return any(map(lambda f: f.exists and marked_incomplete(f), job.output))
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/persistence.py", line 205, in <lambda>
return any(map(lambda f: f.exists and marked_incomplete(f), job.output))
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/persistence.py", line 203, in marked_incomplete
return self._read_record(self._metadata_path, f).get("incomplete", False)
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/persistence.py", line 322, in _read_record_cached
return self._read_record_uncached(subject, id)
File "/home//.conda/envs/snakemake/lib/python3.6/site-packages/snakemake/persistence.py", line 328, in _read_record_uncached
return json.load(f)
File "/home//.conda/envs/snakemake/lib/python3.6/json/__init__.py", line 299, in load
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File "/home//.conda/envs/snakemake/lib/python3.6/json/__init__.py", line 354, in loads
return _default_decoder.decode(s)
File "/home//.conda/envs/snakemake/lib/python3.6/json/decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/home//.conda/envs/snakemake/lib/python3.6/json/decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
I'm not sure how to resolve this: is it a known bug, or is there a way to define my pipeline so it builds a less complex DAG? I am including the first section of my Snakemake file as well. I use the rule all to define all desired output files.
################################
#### Mtb bwa/GATK Snakemake ####
################################
import numpy as np
from collections import defaultdict
import pandas as pd

samples_df = pd.read_table('config/tgen_samples2a.tsv',sep = ',').set_index("sample", drop=False)
sample_names = list(samples_df['sample'])
batch_names = list(samples_df['batch'])
#print(sample_names)

# fastq1 input function definition
def fq1_from_sample(wildcards):
    return samples_df.loc[wildcards.sample, "fastq_1"]

# fastq2 input function definition
def fq2_from_sample(wildcards):
    return samples_df.loc[wildcards.sample, "fastq_2"]

# Define config file. Stores sample names and other things.
configfile: "config/config.yaml"

# Define a rule for running the complete pipeline.
rule all:
    wildcard_constraints:
        batch="IS-.+"
    input:
        trim = expand(['results/{batch}/{samp}/trim/{samp}_trim_1.fq.gz'], zip, samp=sample_names, batch=batch_names),
        kraken = expand('results/{batch}/{samp}/kraken/{samp}_trim_kr_1.fq.gz', zip, samp=sample_names, batch=batch_names),
        bams = expand('results/{batch}/{samp}/bams/{samp}_{mapper}_{ref}_sorted.bam', zip, samp=sample_names, batch=batch_names, ref = config['ref']*len(sample_names), mapper = config['mapper']*len(sample_names)), # When using zip, need to use vectors of equal lengths for all wildcards.
        per_samp_run_stats = expand('results/{batch}/{samp}/stats/{samp}_{mapper}_{ref}_combined_stats.csv', zip, samp=sample_names, batch=batch_names, ref = config['ref']*len(sample_names), mapper = config['mapper']*len(sample_names)),
        amr_stats = expand('results/{batch}/{samp}/stats/{samp}_{mapper}_{ref}_amr.csv', samp=sample_names, batch=batch_names, ref=config['ref'], mapper=config['mapper']),
        cov_stats = expand('results/{batch}/{samp}/stats/{samp}_{mapper}_{ref}_cov_stats.txt', samp=sample_names, batch=batch_names, ref=config['ref'], mapper=config['mapper']),
        all_sample_stats = expand('results/{batch}/stats/combined_per_run_sample_stats.csv', batch = batch_names),
        vcfs = expand('results/{batch}/{samp}/vars/{samp}_{mapper}_{ref}_{caller}_qfilt.vcf.gz', samp=sample_names, batch=batch_names, ref=config['ref'], mapper=config['mapper'], caller = config['caller']),
        ann_vcfs = expand('results/{batch}/{samp}/vars/{samp}_{mapper}_{ref}_gatk_ann.vcf.gz', samp=sample_names, batch=batch_names, ref=config['ref'], mapper=config['mapper'], caller = config['caller']),
        fastas = expand('results/{batch}/{samp}/fasta/{samp}_{mapper}_{ref}_{caller}_{filter}.fa', samp=sample_names, batch=batch_names, ref=config['ref'], mapper=config['mapper'], caller = config['caller'], filter=config['filter']),
        profiles = expand('results/{batch}/{samp}/stats/{samp}_{mapper}_{ref}_lineage.csv', samp=sample_names, batch=batch_names, ref=config['ref'], mapper=config['mapper'])

# Trim reads for quality.
rule trim_reads:
    input:
        p1=fq1_from_sample,
        p2=fq2_from_sample
    output:
        trim1='results/{batch}/{sample}/trim/{sample}_trim_1.fq.gz',
        trim2='results/{batch}/{sample}/trim/{sample}_trim_2.fq.gz'
    log:
        'results/{batch}/{sample}/trim/{sample}_trim_reads.log'
    shell:
        '{config[scripts_dir]}trim_reads.sh {input.p1} {input.p2} {output.trim1} {output.trim2} &>> {log}'

# Filter reads taxonomically with Kraken.
rule taxonomic_filter:
    input:
        trim1='results/{batch}/{samp}/trim/{samp}_trim_1.fq.gz',
        trim2='results/{batch}/{samp}/trim/{samp}_trim_2.fq.gz'
    output:
        kr1='results/{batch}/{samp}/kraken/{samp}_trim_kr_1.fq.gz',
        kr2='results/{batch}/{samp}/kraken/{samp}_trim_kr_2.fq.gz',
        kraken_report='results/{batch}/{samp}/kraken/{samp}_kraken.report',
        kraken_stats = 'results/{batch}/{samp}/kraken/{samp}_kraken_stats.csv'
    log:
        'results/{batch}/{samp}/kraken/{samp}_kraken.log'
    threads: 8
    shell:
        '{config[scripts_dir]}run_kraken.sh {input.trim1} {input.trim2} {output.kr1} {output.kr2} {output.kraken_report} &>> {log}'
Thank you in advance for help using Snakemake!
All the best,
I kind of doubt memory is an issue; 300+ samples is not much, especially if each of them is processed independently of the others.
Try starting from the subset of samples that you say worked and gradually increasing it until you see the problem appear. Perhaps you have some funny value in your sample sheet or in your config? The json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) hints at something like that, in my impression.
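As a quick way to rule out the sample sheet, a small sanity check along these lines (a sketch that only assumes the tgen_samples2a.tsv file and the columns already used in the Snakefile above) will surface empty or duplicated values:

import pandas as pd

# Same file the Snakefile reads; comma-separated despite the .tsv extension.
samples_df = pd.read_table('config/tgen_samples2a.tsv', sep=',')

# Report missing values per column, and any duplicated sample names.
print(samples_df.isna().sum())
print(samples_df[samples_df.duplicated(subset='sample', keep=False)])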
The answer was from @TroyComi, above: after deleting the .snakemake directory, the issue was resolved. Thank you!

Getting error "AttributeError: module "ibapi.contract" has no attribute "UnderComp"

I've tried for some time now to get the following code to work, but I keep getting this error message. What am I doing wrong?
from ib_insync import IB

ib = IB()
ib.connect("127.0.0.1",7497,clientId=1)
stock = Stock("AMD","SMART","USD")
bars = ib.reqHistoricalData(
    stock,
    endDateTime="",
    durationStr="30 D",
    barSizeSetting="1 hour",
    whatToShow="MIDPOINT",
    useRTH="True"
)
print(bars)
Error message:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "c:/Users/Ejer/Desktop/TWS/option.py", line 1, in <module>
from ib_insync import IB
File "c:\Users\Ejer\Miniconda3\lib\site-packages\ib_insync\__init__.py", line 21, in <module>
from .objects import *
File "c:\Users\Ejer\Miniconda3\lib\site-packages\ib_insync\objects.py", line 155, in <module>
class UnderComp(Object):
File "c:\Users\Ejer\Miniconda3\lib\site-packages\ib_insync\objects.py", line 156, in UnderComp
defaults = ibapi.contract.UnderComp().__dict__
AttributeError: module 'ibapi.contract' has no attribute 'UnderComp'
It seems you are using an old version of the ib-insync package (0.9.11). Try installing the latest version, ib-insync 0.9.64; that worked for me.
Also, follow the group for more information: https://groups.io/g/insync
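For reference, a minimal sketch of the same request after upgrading; two details of the question's snippet are also adjusted here as assumptions rather than as part of the original error: Stock needs to be imported alongside IB, and useRTH is a boolean rather than the string "True".

from ib_insync import IB, Stock

ib = IB()
ib.connect("127.0.0.1", 7497, clientId=1)

stock = Stock("AMD", "SMART", "USD")
bars = ib.reqHistoricalData(
    stock,
    endDateTime="",
    durationStr="30 D",
    barSizeSetting="1 hour",
    whatToShow="MIDPOINT",
    useRTH=True,  # boolean, not the string "True"
)
print(bars)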

How to fine-tune a NiftyNet pre-trained model for custom data

I want to use a NiftyNet pre-trained segmentation model for segmenting custom data. I downloaded the pre-trained weights and modified the model_dir path to point to the downloaded one.
However, when I run
python3 net_segment.py train -c /home/Container_data/config/promise12_demo_train_config.ini
I am getting the error below.
Caused by op 'save/Assign_17', defined at:
File "net_segment.py", line 8, in <module>
sys.exit(main())
File "/home/NiftyNet/niftynet/__init__.py", line 142, in main
app_driver.run(app_driver.app)
File "/home/NiftyNet/niftynet/engine/application_driver.py", line 197, in run
SESS_STARTED.send(application, iter_msg=None)
File "/usr/local/lib/python3.5/dist-packages/blinker/base.py", line 267, in send
for receiver in self.receivers_for(sender)]
File "/usr/local/lib/python3.5/dist-packages/blinker/base.py", line 267, in <listcomp>
for receiver in self.receivers_for(sender)]
File "/home/NiftyNet/niftynet/engine/handler_model.py", line 109, in restore_model
var_list=to_restore, save_relative_paths=True)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1102, in __init__
self.build()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1114, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1151, in _build
build_save=build_save, build_restore=build_restore)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 795, in _build_internal
restore_sequentially, reshape)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 428, in _AddRestoreOps
assign_ops.append(saveable.restore(saveable_tensors, shapes))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 119, in restore
self.op.get_shape().is_fully_defined())
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/state_ops.py", line 221, in assign
validate_shape=validate_shape)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_state_ops.py", line 61, in assign
use_locking=use_locking, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): Restoring from checkpoint failed. This is most likely due to a mismatch between the current graph and the graph from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:
Assign requires shapes of both tensors to match. lhs shape= [3,3,61,256] rhs shape= [3,3,3,61,9]
[[node save/Assign_17 (defined at /home/NiftyNet/niftynet/engine/handler_model.py:109) = Assign[T=DT_FLOAT, _class=["loc:#DenseVNet/conv/conv_/w"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](DenseVNet/conv/conv_/w, save/RestoreV2/_35)
https://github.com/tensorflow/models/issues/5390
The link above says to add
--initialize_last_layer = False
--last_layers_contain_logits_only = False
Can someone help me figure out how to get rid of this error?
It seems you are having problems with your last layer. When you use a pretrained model on a new task, you usually need to change the last layer to fit the new requirements.
To do that, modify your config file so that all variables except the last layer are restored:
vars_to_restore = ^((?!(last_layer_name)).)*$
and then set num_classes to suit your new segmentation problem.
You can check transfer learning docs here: https://niftynet.readthedocs.io/en/dev/transfer_learning.html
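As a quick illustration of what that pattern does (plain Python, not NiftyNet code; last_layer_name is a placeholder for your actual output-layer scope):

import re

# Negative lookahead: match only variable names that do NOT contain the excluded layer name.
pattern = re.compile(r"^((?!(last_layer_name)).)*$")

for name in ["DenseVNet/conv/conv_/w", "DenseVNet/last_layer_name/w"]:
    action = "restore from checkpoint" if pattern.match(name) else "re-initialise for the new task"
    print(f"{name}: {action}")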

Problems with OAuth2 and gspread

I've had some working API code for quite a long time, but suddenly (about 30 minutes after the previous use of the API) it stopped working.
Here's the traceback:
row_cells = self.range('%s:%s' % (start_cell, end_cell))
File"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gspread/models.py", line 72, in wrapper
return method(self, *args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gspread/models.py", line 412, in range
params={'range': name, 'return-empty': 'true'}
File "/Lbrary/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gspread/client.py", line 176, in get_cells_feed
r = self.session.get(url)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gspread/httpsession.py", line 73, in get
return self.request('GET', url, params=params, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gspread/httpsession.py", line 69, in request
response.status_code, response.content))
gspread.exceptions.RequestError: (401, '401: b\'<HTML>\\n<HEAD>\\n<TITLE>Unauthorized</TITLE>\\n</HEAD>\\n<BODY BGCOLOR="#FFFFFF" TEXT="#000000">\\n<H1>Unauthorized</H1>\\n<H2>Error 401</H2>\\n</BODY>\\n</HTML>\\n\'')
And I don't really understand this.
Here's the code:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pprint
scope = [ 'https://spreadsheets.google.com/feeds' ]
creds = ServiceAccountCredentials.from_json_keyfile_name('client_secret.json', scope)
client = gspread.authorize(creds)
sheet = client.open('sheet_name').sheet1
I really don't know what to do. I've already created a new API (service account) email address and downloaded the JSON file (client_secret.json), but it still isn't working and I honestly don't know why.
