KeyError in Biopython while creating a motif

I am trying to generate a weblogo for the protein sequences provided. The following is my code:
from Bio.Seq import Seq
from Bio import motifs
from Bio.Alphabet import generic_protein
instances = [Seq("RWST"),
Seq("RTAG"),
Seq("RQGC"),
Seq("RMAA"),
]
m = motifs.create(instances)
m.weblogo("mymotif.png")
I get the following error:
counts[letter][position] += 1
KeyError: 'R'
Full stack trace:
<ipython-input-3-ee8922743152> in <module>()
10
11
---> 12 m = motifs.create(instances)
13 m.weblogo("mymotif.png")
lib/site-packages/Bio/motifs/__init__.py in create(instances, alphabet)
21 def create(instances, alphabet=None):
22 instances = Instances(instances, alphabet)
---> 23 return Motif(instances=instances, alphabet=alphabet)
24
25
lib/site-packages/Bio/motifs/__init__.py in __init__(self, alphabet, instances, counts)
236 self.instances = instances
237 alphabet = self.instances.alphabet
--> 238 counts = self.instances.count()
239 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts)
240 self.length = self.counts.length
lib/site-packages/Bio/motifs/__init__.py in count(self)
192 for instance in self:
193 for position, letter in enumerate(instance):
--> 194 counts[letter][position] += 1
195 return counts
196
KeyError: 'R'

Motif takes an alphabet as a keyword (named) argument, and so does motifs.create. If none is given, Biopython assumes the sequence is DNA, and in your case the letter 'R' is not in that alphabet.
For your example you need to use IUPAC.protein to make it work.
Note: Biopython uses the alphabet's letters internally to see which characters are available; generic_protein has no letters defined.
from Bio import motifs
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
instances = [Seq("RWST", IUPAC.protein),
Seq("RTAG", IUPAC.protein),
Seq("RQGC", IUPAC.protein),
Seq("RMAA", IUPAC.protein),
]
m = motifs.create(instances, IUPAC.protein)
m.weblogo("mymotif.png")

Related

How do I fix this error: TypeError: '>=' not supported between instances of 'int' and 'str'

I am building and testing an outbreak prediction model, but I am facing an error:
TypeError Traceback (most recent call last)
Cell In [12], line 7
5 for i in [mea,mum,per,pol,rub,sma]:
6 train_data = train_data.append(i)
----> 7 train_data = train_data.loc[(train_data['Year'] >= '1960') & (train_data['Year'] <= '2011')]
9 # examine the dataset
10 train_data.head()
File c:\Users\Alkaejah\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\ops\common.py:72, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
68 return NotImplemented
70 other = item_from_zerodim(other)
---> 72 return method(self, other)
File c:\Users\Alkaejah\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arraylike.py:62, in OpsMixin.__ge__(self, other)
60 #unpack_zerodim_and_defer("__ge__")
61 def __ge__(self, other):
---> 62 return self._cmp_method(other, operator.ge)
File c:\Users\Alkaejah\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\series.py:6243, in Series._cmp_method(self, other, op)
6240 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
6242 with np.errstate(all="ignore"):
-> 6243 res_values = ops.comparison_op(lvalues, rvalues, op)
6245 return self._construct_result(res_values, name=res_name)
...
35 typ = type(right).__name__
---> 36 raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
37 return res_values
TypeError: Invalid comparison between dtype=int64 and str
SOURCE CODE:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import calendar
import numpy as np
def week2month(hep):
    hep['LastDayWeek'] = pd.to_datetime((hep['week']-1).astype(str) + "6", format="%Y%U%w")
    hep['MonthMax'] = pd.DatetimeIndex(hep['LastDayWeek']).month
    hep['Year'] = pd.DatetimeIndex(hep['LastDayWeek']).year
    hep['MonthName'] = [calendar.month_name[i] for i in hep.MonthMax]
    return hep
def get_input_transform(file_):
    return week2month(pd.read_csv(file_))
hep = get_input_transform('hepatitis.csv')
mea = get_input_transform('measles.csv')
mum = get_input_transform('mumps.csv')
per = get_input_transform('pertussis.csv')
pol = get_input_transform('polio.csv')
rub = get_input_transform('rubella.csv')
sma = get_input_transform('smallpox.csv')
train_data = hep
for i in [mea,mum,per,pol,rub,sma]:
    train_data = train_data.append(i)
train_data = train_data.loc[(train_data['Year'] >= 1960) & (train_data['Year'] <=2011)]
train_data.head()
# some data discrepancies must be resolved (issues like \\N )
train_data_bad = train_data[train_data.cases==train_data.cases.max()]
ind = list(train_data_bad.index)
train_data = train_data.drop(train_data.index[ind])
I have tried different articles to debug the code, and this is a thesis-related matter.
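For what it's worth, the traceback comes from comparing the integer Year column against the strings '1960' and '2011' (the posted source already compares against ints, which is the fix). A minimal, self-contained sketch of the working comparison on a hypothetical frame; note also that DataFrame.append has since been removed in pandas 2.0 in favour of pd.concat:
import pandas as pd
# Hypothetical stand-in for the appended training data
train_data = pd.DataFrame({"Year": [1955, 1960, 1985, 2011, 2015],
                           "cases": [10, 20, 30, 40, 50]})
# 'Year' is int64, so compare against ints, not strings like '1960'
train_data = train_data.loc[(train_data['Year'] >= 1960) & (train_data['Year'] <= 2011)]
print(train_data)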

ColumnTransformer with remainder='passthrough' gives me an error

I have a small dataset, like this
_d=pd.DataFrame([
[1,2.0,'a','mango','2017-07-07',1],
[2,2.55,'b','apple','2017-08-07',0],
[3,5.7,np.nan,'bannan',np.nan,1],
[4,np.nan,'d','grpaes','2017-09-07',1],
[5,5.7,'e','pineapple','2017-10-07',0],
[6,8.3,np.nan,'orange','2017-01-07',0],
[5,5.7,'e',np.nan,'2017-10-07',1],
[6,np.nan,'f',np.nan,np.nan,0],
[7,6.8,'g','pomegranate','2017-02-07',1],
[np.nan,55.5,'h','water melon','2017-03-07',0],
[9,6.8,'i','mango',np.nan,1],
[10,3.5,np.nan,'orange','2017-06-07',1],
[11,2.78,'k','pomegranate','2017-09-07',0]
] ,columns=['ind','score','grade','group','da','target']
)
To handle NaN values and encode the categorical features, I used this code:
y=_d['target']
x=_d.drop(['target'],axis=1)
int_columns=_d.select_dtypes(['float64','int64']).columns
obj_columns=_d.select_dtypes(['object','category']).columns
int_pipeline=Pipeline([
('impute_values',SimpleImputer(missing_values=np.nan,strategy='mean')),
('scaling',StandardScaler())
])
cat_pipeline=Pipeline([
('cat_impute',SimpleImputer(strategy='constant',fill_value='missing')),
('encoding',OneHotEncoder(drop='first'))
])
column_trans=ColumnTransformer(transformers=[
('int_p',int_pipeline,['ind', 'score']),
('cat_p',cat_pipeline,['grade', 'group'])
],remainder='passthrough')
mdl_pipeline=Pipeline([
('value_transform',column_trans)
])
transformed_data=mdl_pipeline.fit_transform(x,y)
When I run this code, I get the following error
ValueError: could not convert string to float: '2017-07-07'
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call
last) Input In [253], in <cell line: 27>()
18 column_trans=ColumnTransformer(transformers=[
19 ('int_p',int_pipeline,['ind', 'score']),
20 ('cat_p',cat_pipeline,['grade', 'group'])
21 ],remainder='passthrough')
23 mdl_pipeline=Pipeline([
24 ('value_transform',column_trans)
25 # ,('mdl',LogisticRegression())
26 ])
---> 27 transformed_data=mdl_pipeline.fit_transform(x,y)
File ~\Anaconda3\lib\site-packages\sklearn\pipeline.py:434, in
Pipeline.fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
File
~\Anaconda3\lib\site-packages\sklearn\compose_column_transformer.py:699,
in ColumnTransformer.fit_transform(self, X, y)
696 self._validate_output(Xs)
697 self._record_output_indices(Xs)
--> 699 return self._hstack(list(Xs))
File
~\Anaconda3\lib\site-packages\sklearn\compose_column_transformer.py:783,
in ColumnTransformer._hstack(self, Xs)
778 converted_Xs = [
779 check_array(X, accept_sparse=True, force_all_finite=False)
780 for X in Xs
781 ]
782 except ValueError as e:
--> 783 raise ValueError(
784 "For a sparse output, all columns should "
785 "be a numeric or convertible to a numeric."
786 ) from e
788 return sparse.hstack(converted_Xs).tocsr()
789 else:
ValueError: For a sparse output, all columns should be a numeric or
convertible to a numeric.
The value error
ValueError: could not convert string to float: '2017-07-07'
doesn't make any sense, as I have set remainder='passthrough' in the ColumnTransformer. Why is my code not working?
Set sparse_threshold=0 on your ColumnTransformer. Otherwise, according to the docs:
If the output of the different transformers contains sparse matrices, these will be stacked as a sparse matrix if the overall density is lower than this value. Use sparse_threshold=0 to always return dense. When the transformed output consists of all dense data, the stacked result will be dense, and this keyword will be ignored.
So it is trying to stack the results of the OneHotEncoder (and everything else) into a sparse matrix, but it can't, since sparse matrices require numeric values, hence the attempt to convert the passed-through date strings to something numeric.
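A minimal sketch of that change, reusing the int_pipeline and cat_pipeline from the question (nothing else needs to move):
column_trans = ColumnTransformer(
    transformers=[
        ('int_p', int_pipeline, ['ind', 'score']),
        ('cat_p', cat_pipeline, ['grade', 'group'])
    ],
    remainder='passthrough',
    sparse_threshold=0  # always stack to a dense array, so the passed-through
                        # 'da' date strings never have to become numeric
)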

Repartition dask dataframe while maintaining its original sequence

I am trying to repartition multiple .parquet files in order to save a specific number of parquet files. I have time-series data that depends on the number of observations (instead of timestamps) for each client, so I need to ensure that the partitioning will not split a series across two files. In addition, I want to preserve the order, since I have the labels stored elsewhere. Here is an example of what I am trying to do:
import pandas as pd
import dask.dataframe as dd
ids = [9635, 1536, 8477, 1088, 6411, 2251]
df = pd.DataFrame({
"partition" : [0]*3 + [1]*3 + [2]*3 + [3]*3 + [4]*3 + [5]*3,
"customer_id" : [ids[0]]*3 + [ids[1]]*3 + [ids[2]]*3 + [ids[3]]*3 + [ids[4]]*3 + [ids[5]]*3,
"x": range(18)})
# indexing on "customer_id" here
df = df.set_index("customer_id")
ddf = dd.from_pandas(df, npartitions=6)
ddf.to_parquet("my_parquets")
read_ddf = dd.read_parquet("my_parquets/*.parquet")
last_idx = [ids[-1]]
my_divisions = ids + last_idx
read_ddf.divisions = my_divisions
# Split into two equal partitions with three customers each
new_divisions = [my_divisions[0], my_divisions[3], my_divisions[5]]
new_ddf = read_ddf.repartition(divisions=new_divisions)
which raises an error:
ValueError: New division must be sorted
I have tried an alternative approach, which involves setting the "partition" column as the index and modifying the index to "ids" later, but this sorts my entire dataframe, which is undesired because the new sequence no longer matches the labels stored. This is shown here:
import pandas as pd
import dask.dataframe as dd
ids = [9635, 1536, 8477, 1088, 6411, 2251]
df = pd.DataFrame({
"partition" : [0]*3 + [1]*3 + [2]*3 + [3]*3 + [4]*3 + [5]*3,
"customer_id" : [ids[0]]*3 + [ids[1]]*3 + [ids[2]]*3 + [ids[3]]*3 + [ids[4]]*3 + [ids[5]]*3,
"x": range(18)})
# indexing on the defined "partition" instead
df = df.set_index("partition")
ddf = dd.from_pandas(df, npartitions=6)
ddf.to_parquet("my_parquets")
read_ddf = dd.read_parquet("my_parquets/*.parquet")
# my_range is equivalent to the list of partitions
my_range = [i for i in range(0,6)]
last_idx = [my_range[-1]]
my_divisions = my_range + last_idx
read_ddf.divisions = my_divisions
new_divisions = [0, 2, 4, 5]
new_ddf = read_ddf.repartition(divisions=new_divisions)
# Need the "customer_id" as index
new_ddf = new_ddf.set_index("customer_id", drop = True)
But this sorts the dataframe by the index and messes up the structure, while I would like to keep the original order.
print("Partition 0")
print(new_ddf.get_partition(0).compute())
print("-------------------")
print("Partition 1")
print(new_ddf.get_partition(1).compute())
print("-------------------")
print("Partition 2")
print(new_ddf.get_partition(2).compute())
Partition 0
Empty DataFrame
Columns: [x]
Index: []
-------------------
Partition 1
x
customer_id
1088 9
1088 10
1088 11
1536 3
1536 4
1536 5
-------------------
Partition 2
x
customer_id
2251 15
2251 16
2251 17
6411 12
6411 13
6411 14
8477 6
8477 7
8477 8
9635 0
9635 1
9635 2
Are there any workarounds for this issue? I am aware that set_index in dask is quite expensive, but none of the approaches are currently working. Also, in my case I already have the .parquet files with the preprocessed data, so I only created the initial dataframe using pandas for demonstration purposes (it would have been much easier to specify the number of partitions in the first step if I had all the data in pandas).
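A note on the first error: dask divisions are the index values at the partition boundaries and must be non-decreasing, so assigning the unsorted ids list as divisions means any later repartition(divisions=...) call will be rejected regardless of which boundaries are picked. A quick check, using the ids from the example:
ids = [9635, 1536, 8477, 1088, 6411, 2251]
my_divisions = ids + [ids[-1]]
# Dask requires divisions to be sorted along the index
print(my_divisions == sorted(my_divisions))  # False -> "New division must be sorted"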

Tensorflow. Switching from BasicRNNCell to LSTMCell

I have built an RNN with BasicRNNCell; now I want to use LSTMCell, but the change does not seem trivial. What should I change?
First I define all the placeholders and variables:
X_placeholder = tf.placeholder(tf.float32, [batch_size, truncated_backprop_length, embedding_size])
Y_placeholder = tf.placeholder(tf.int32, [batch_size, truncated_backprop_length])
init_state = tf.placeholder(tf.float32, [batch_size, state_size])
W = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b = tf.Variable(np.zeros((batch_size, num_classes)), dtype=tf.float32)
W2 = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b2 = tf.Variable(np.zeros((batch_size, num_classes)), dtype=tf.float32)
Then I unstack the labels:
labels_series = tf.transpose(Y_placeholder)
labels_series = tf.unstack(Y_placeholder, axis=1)
inputs_series = X_placeholder
Then I define my RNN:
cell = tf.contrib.rnn.BasicLSTMCell(state_size, state_is_tuple = False)
states_series, current_state = tf.nn.dynamic_rnn(cell, inputs_series, initial_state = init_state)
The error that I get is:
InvalidArgumentError Traceback (most recent call last)
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/common_shapes.py in _call_cpp_shape_fn_impl(op, input_tensors_needed, input_tensors_as_shapes_needed, debug_python_shape_fn, require_shape_fn)
669 node_def_str, input_shapes, input_tensors, input_tensors_as_shapes,
--> 670 status)
671 except errors.InvalidArgumentError as err:
/home/deepnlp2017/anaconda3/lib/python3.5/contextlib.py in __exit__(self, type, value, traceback)
65 try:
---> 66 next(self.gen)
67 except StopIteration:
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
468 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 469 pywrap_tensorflow.TF_GetCode(status))
470 finally:
InvalidArgumentError: Dimensions must be equal, but are 50 and 100 for 'rnn/while/basic_lstm_cell/mul' (op: 'Mul') with input shapes: [32,50], [32,100].
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-19-2ac617f4dde4> in <module>()
4 #cell = tf.contrib.rnn.BasicRNNCell(state_size)
5 cell = tf.contrib.rnn.BasicLSTMCell(state_size, state_is_tuple = False)
----> 6 states_series, current_state = tf.nn.dynamic_rnn(cell, inputs_series, initial_state = init_state)
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/rnn.py in dynamic_rnn(cell, inputs, sequence_length, initial_state, dtype, parallel_iterations, swap_memory, time_major, scope)
543 swap_memory=swap_memory,
544 sequence_length=sequence_length,
--> 545 dtype=dtype)
546
547 # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/rnn.py in _dynamic_rnn_loop(cell, inputs, initial_state, parallel_iterations, swap_memory, sequence_length, dtype)
710 loop_vars=(time, output_ta, state),
711 parallel_iterations=parallel_iterations,
--> 712 swap_memory=swap_memory)
713
714 # Unpack final output if not using output tuples.
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py in while_loop(cond, body, loop_vars, shape_invariants, parallel_iterations, back_prop, swap_memory, name)
2624 context = WhileContext(parallel_iterations, back_prop, swap_memory, name)
2625 ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, context)
-> 2626 result = context.BuildLoop(cond, body, loop_vars, shape_invariants)
2627 return result
2628
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py in BuildLoop(self, pred, body, loop_vars, shape_invariants)
2457 self.Enter()
2458 original_body_result, exit_vars = self._BuildLoop(
-> 2459 pred, body, original_loop_vars, loop_vars, shape_invariants)
2460 finally:
2461 self.Exit()
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py in _BuildLoop(self, pred, body, original_loop_vars, loop_vars, shape_invariants)
2407 structure=original_loop_vars,
2408 flat_sequence=vars_for_body_with_tensor_arrays)
-> 2409 body_result = body(*packed_vars_for_body)
2410 if not nest.is_sequence(body_result):
2411 body_result = [body_result]
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/rnn.py in _time_step(time, output_ta_t, state)
695 skip_conditionals=True)
696 else:
--> 697 (output, new_state) = call_cell()
698
699 # Pack state if using state tuples
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/rnn.py in <lambda>()
681
682 input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t)
--> 683 call_cell = lambda: cell(input_t, state)
684
685 if sequence_length is not None:
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py in __call__(self, inputs, state, scope)
182 i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
183
--> 184 new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
185 self._activation(j))
186 new_h = self._activation(new_c) * sigmoid(o)
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py in binary_op_wrapper(x, y)
882 if not isinstance(y, sparse_tensor.SparseTensor):
883 y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
--> 884 return func(x, y, name=name)
885
886 def binary_op_wrapper_sparse(sp_x, y):
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py in _mul_dispatch(x, y, name)
1103 is_tensor_y = isinstance(y, ops.Tensor)
1104 if is_tensor_y:
-> 1105 return gen_math_ops._mul(x, y, name=name)
1106 else:
1107 assert isinstance(y, sparse_tensor.SparseTensor) # Case: Dense * Sparse.
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/ops/gen_math_ops.py in _mul(x, y, name)
1623 A `Tensor`. Has the same type as `x`.
1624 """
-> 1625 result = _op_def_lib.apply_op("Mul", x=x, y=y, name=name)
1626 return result
1627
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py in apply_op(self, op_type_name, name, **keywords)
761 op = g.create_op(op_type_name, inputs, output_types, name=scope,
762 input_types=input_types, attrs=attr_protos,
--> 763 op_def=op_def)
764 if output_structure:
765 outputs = op.outputs
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in create_op(self, op_type, inputs, dtypes, input_types, name, attrs, op_def, compute_shapes, compute_device)
2395 original_op=self._default_original_op, op_def=op_def)
2396 if compute_shapes:
-> 2397 set_shapes_for_outputs(ret)
2398 self._add_op(ret)
2399 self._record_op_seen_by_control_dependencies(ret)
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in set_shapes_for_outputs(op)
1755 shape_func = _call_cpp_shape_fn_and_require_op
1756
-> 1757 shapes = shape_func(op)
1758 if shapes is None:
1759 raise RuntimeError(
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in call_with_requiring(op)
1705
1706 def call_with_requiring(op):
-> 1707 return call_cpp_shape_fn(op, require_shape_fn=True)
1708
1709 _call_cpp_shape_fn_and_require_op = call_with_requiring
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/common_shapes.py in call_cpp_shape_fn(op, input_tensors_needed, input_tensors_as_shapes_needed, debug_python_shape_fn, require_shape_fn)
608 res = _call_cpp_shape_fn_impl(op, input_tensors_needed,
609 input_tensors_as_shapes_needed,
--> 610 debug_python_shape_fn, require_shape_fn)
611 if not isinstance(res, dict):
612 # Handles the case where _call_cpp_shape_fn_impl calls unknown_shape(op).
/home/deepnlp2017/.local/lib/python3.5/site-packages/tensorflow/python/framework/common_shapes.py in _call_cpp_shape_fn_impl(op, input_tensors_needed, input_tensors_as_shapes_needed, debug_python_shape_fn, require_shape_fn)
673 missing_shape_fn = True
674 else:
--> 675 raise ValueError(err.message)
676
677 if missing_shape_fn:
ValueError: Dimensions must be equal, but are 50 and 100 for 'rnn/while/basic_lstm_cell/mul' (op: 'Mul') with input shapes: [32,50], [32,100].
You should consider including the full error trace; otherwise it is hard (or impossible) to help.
I reproduced the situation and found that the issue was coming from state unpacking, i.e. the line c, h = state.
Try setting state_is_tuple to False, i.e.:
cell = tf.contrib.rnn.BasicLSTMCell(state_size, state_is_tuple=False)
I'm not sure why this is happening. Are you loading a previous model? What is your TensorFlow version?
More information on TensorFlow RNN Cells:
I would suggest you take a look at the WildML post, section "RNN CELLS, WRAPPERS AND MULTI-LAYER RNNS".
It states that:
BasicRNNCell – A vanilla RNN cell.
GRUCell – A Gated Recurrent Unit cell.
BasicLSTMCell – An LSTM cell based on Recurrent Neural Network Regularization. No peephole connection or cell clipping.
LSTMCell – A more complex LSTM cell that allows for optional peephole connections and cell clipping.
MultiRNNCell – A wrapper to combine multiple cells into a multi-layer cell.
DropoutWrapper – A wrapper to add dropout to input and/or output connections of a cell.
Given this, I would suggest you switch from BasicRNNCell to BasicLSTMCell. Basic here means "use it unless you know what you are doing". If you want to try LSTMs without going into the details, that's the way to go. It may be straightforward: just swap it in and voilà!
If not, share some of your code + error.
Hope it helps
The problem seems to be with the init_state variable.
Basic RNN cells have only one state variable, while an LSTM cell has both a cell state and a hidden state. Specifying state_is_tuple=False concatenates the two state variables into one, therefore doubling the size of what you specified in the init_state declaration.
To avoid this, you can use the built-in zero_state method of the LSTM cell to initialize the state correctly without worrying about size differences.
So it would simply be:
init_state = cell.zero_state(batch_size, dtype)
Of course, this will have to be placed after the line where the cell is built.
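Putting that together with the names from the question, a minimal sketch (TF 1.x contrib API; leaving state_is_tuple at its default True is an assumption on my part):
cell = tf.contrib.rnn.BasicLSTMCell(state_size)  # state_is_tuple=True by default
# zero_state returns an LSTMStateTuple (c, h) with the shapes the cell expects,
# so the hand-made [batch_size, state_size] init_state placeholder is not needed
init_state = cell.zero_state(batch_size, tf.float32)
states_series, current_state = tf.nn.dynamic_rnn(cell, X_placeholder,
                                                 initial_state=init_state)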

Convert FASTA to GenBank

Is there a way to use Biopython to convert FASTA files to GenBank format? There are many answers on how to convert from GenBank to FASTA, but not the other way around.
Before converting, you must assign an alphabet to each sequence (DNA or protein):
from Bio import SeqIO
from Bio.Alphabet import generic_dna, generic_protein
input_handle = open("test.fasta", "rU")
output_handle = open("test.gb", "w")
sequences = list(SeqIO.parse(input_handle, "fasta"))
# assign generic_dna or generic_protein
for seq in sequences:
    seq.seq.alphabet = generic_dna
count = SeqIO.write(sequences, output_handle, "genbank")
output_handle.close()
input_handle.close()
print "Converted %i records" % count
For this input:
>I28Q9A102FII8J rank=0668881 x=2144.0 y=1105.0 length=418
ACGTCATGAGAGTTTGATCATGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGATGAA
GCTCCAGCTTGCTGGGGTGGATTAGTGGCGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTGACTCTGGGAT
AAGCGTTGGAAACGACGTCTAATACCGGATATGACGACCGATGGCATCATCTGGTTGTGGAAAGAATTTTGGTC
AAGGATGGACTCGCGGCCTATCAGGTAGTTGGTGAGGTAATGGCTCACCAAGCCTACGACGGGTAGCCGGCCTG
AGAGGGTGACCGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCA
CAATGGGCGAAAGCCTGATGCAGCAACGCCGCGTGAGGGATGACGGCC
>I28Q9A102JMH72 rank=0320459 x=3829.0 y=3120.0 length=512
ACGTCATGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGGCTTAACACATGCAAGTCGAGGGTAGAA
ATAGCTTGCTATTTTGAGACCGGCGCACGGGTGCGTAACGCGTATGCAATCTGCCTTTTACAGGGGAATAGCCC
AGAGAAATTTGGATTAATGCCCCATAGCGCTGCAGGGCGGCATCGCCGAGCAGCTAAAGTCACAACGGTAAAGA
TGAGCATGCGTCCCATTAGCTAGTTGGTAAGGTAACGGCTTACCAAGGCGATGATGGGTAGGGTCCTGAGAGGG
AGATCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGG
GCGCAAGCCTGAACCAGCCATGCCGCGTGCAGGATGAAGGCCTTCGGGTTGTAAACTGCTTTTGACGGAACGAA
AAAGCT
you get:
LOCUS I28Q9A102FII8J 418 bp DNA UNK 01-JAN-1980
DEFINITION I28Q9A102FII8J rank=0668881 x=2144.0 y=1105.0 length=418
ACCESSION I28Q9A102FII8J
VERSION I28Q9A102FII8J
KEYWORDS .
SOURCE .
ORGANISM .
.
FEATURES Location/Qualifiers
ORIGIN
1 acgtcatgag agtttgatca tggctcagga cgaacgctgg cggcgtgctt aacacatgca
61 agtcgaacga tgaagctcca gcttgctggg gtggattagt ggcgaacggg tgagtaacac
121 gtgagtaacc tgcccttgac tctgggataa gcgttggaaa cgacgtctaa taccggatat
181 gacgaccgat ggcatcatct ggttgtggaa agaattttgg tcaaggatgg actcgcggcc
241 tatcaggtag ttggtgaggt aatggctcac caagcctacg acgggtagcc ggcctgagag
301 ggtgaccggc cacactggga ctgagacacg gcccagactc ctacgggagg cagcagtggg
361 gaatattgca caatgggcga aagcctgatg cagcaacgcc gcgtgaggga tgacggcc
//
LOCUS I28Q9A102JMH72 450 bp DNA UNK 01-JAN-1980
DEFINITION I28Q9A102JMH72 rank=0320459 x=3829.0 y=3120.0 length=512
ACCESSION I28Q9A102JMH72
VERSION I28Q9A102JMH72
KEYWORDS .
SOURCE .
ORGANISM .
.
FEATURES Location/Qualifiers
ORIGIN
1 acgtcatgag agtttgatcc tggctcagga tgaacgctag cggcaggctt aacacatgca
61 agtcgagggt agaaatagct tgctattttg agaccggcgc acgggtgcgt aacgcgtatg
121 caatctgcct tttacagggg aatagcccag agaaatttgg attaatgccc catagcgctg
181 cagggcggca tcgccgagca gctaaagtca caacggtaaa gatgagcatg cgtcccatta
241 gctagttggt aaggtaacgg cttaccaagg cgatgatggg tagggtcctg agagggagat
301 cccccacact ggtactgaga cacggaccag actcctacgg gaggcagcag tgaggaatat
361 tggtcaatgg gcgcaagcct gaaccagcca tgccgcgtgc aggatgaagg ccttcgggtt
421 gtaaactgct tttgacggaa cgaaaaagct
//
Here's an update of Jose's answer for Python 3 and newer Biopython. Biopython doesn't use alphabets any longer. Maybe it will save you a bit of time.
from Bio import SeqIO
input_handle = open("test.fasta", "r")
output_handle = open("test.gb", "w")
sequences = list(SeqIO.parse(input_handle, "fasta"))
# assign molecule type
for seq in sequences:
    seq.annotations['molecule_type'] = 'DNA'
count = SeqIO.write(sequences, output_handle, "genbank")
output_handle.close()
input_handle.close()
print("Converted {} records".format(count))
It is possible to convert FASTA to GenBank format for unsubmitted sequences, which don't have accession numbers yet (i.e. sequences that are still to be submitted to NCBI).
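As a possible shortcut on recent Biopython (1.79+, which I'm assuming added a molecule_type argument to SeqIO.convert), the whole loop can be replaced by a one-liner; treat this as a sketch and fall back to the loop above if your version doesn't accept the argument:
from Bio import SeqIO
# One-shot conversion; molecule_type fills the annotation GenBank output requires
count = SeqIO.convert("test.fasta", "fasta", "test.gb", "genbank", molecule_type="DNA")
print("Converted {} records".format(count))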
