How can I get the length of a protein chain from a PDB file with Biopython? - biopython

I have tried it this way first:
# Count every residue that PDB.is_aa() recognises, across all models.
# `structure` and the counter `x` must already exist before this loop runs.
for model in structure:
    for residue in model.get_residues():
        if PDB.is_aa(residue):
            x += 1
and then that way:
len(structure[0][chain])
But neither of them seems to work...

Your code should work and give you the correct results.
from Bio import PDB

parser = PDB.PDBParser()
pdb1 = './1bfg.pdb'
structure = parser.get_structure("1bfg", pdb1)

# Approach 1: classify each residue by the first field of its id
# (the hetero flag). A blank flag is counted as a regular residue,
# anything else (waters, hetero groups) as "other".
res_no = 0
non_resi = 0
for model in structure:
    for chain in model:
        for r in chain.get_residues():
            if r.id[0] == ' ':
                res_no += 1
            else:
                non_resi += 1
print("Residues: %i" % (res_no))
print("Other: %i" % (non_resi))

# Approach 2: let Biopython decide whether a residue is an amino acid.
res_no2 = 0
non_resi2 = 0
for model in structure:
    for residue in model.get_residues():
        if PDB.is_aa(residue):
            res_no2 += 1
        else:
            non_resi2 += 1
print("Residues2: %i" % (res_no2))
print("Other2: %i" % (non_resi2))
Output:
Residues: 126
Other: 99
Residues2: 126
Other2: 99
Your statement
print (len(structure[0]['A']))
gives you the sum (225) of all residues, in this case all amino acids and water atoms.
The numbers seem to be correct when compared to manual inspection using PyMol.
What is the specific error message you are getting or the output you are expecting? Any specific PDB file?
Since the PDB file is mostly used to store the coordinates of the resolved atoms, it is not always possible to get the full structure. Another approach would be to use the cif files.
from Bio import PDB

pdb1 = './1bfg.cif'
# Load the mmCIF file as a flat mapping of data-item names to values.
m = PDB.MMCIF2Dict.MMCIF2Dict(pdb1)
# The deposited sequence is only available when the file carries the
# _entity_poly category, so check for the key before reading it.
if '_entity_poly.pdbx_seq_one_letter_code' in m:
    print('Full structure:')
    full_structure = m['_entity_poly.pdbx_seq_one_letter_code']
    print(full_structure)
    # NOTE(review): this is the sequence length only if the value is a
    # single string; if the installed Biopython returns a list of
    # per-entity sequences, len() counts entities instead -- confirm.
    print(len(full_structure))
Output:
Full structure:
PALPEDGGSGAFPPGHFKDPKRLYCKNGGFFLRIHPDGRVDGVREKSDPHIKLQLQAEERGVVSIKGVSANRYLAMKEDGRLLASKSVTDECFFFERLESNNYNTYRSRKYTSWYVALKRTGQYKLGSKTGPGQKAILFLPMSAKS
146
For multiple chains:
from Bio import PDB

pdb1 = './4hlu.cif'
m = PDB.MMCIF2Dict.MMCIF2Dict(pdb1)
if '_entity_poly.pdbx_seq_one_letter_code' in m:
    full_structure = m['_entity_poly.pdbx_seq_one_letter_code']
    chains = m['_entity_poly.pdbx_strand_id']
    # Both lists are read from the same mmCIF category, so walk them in
    # lockstep instead of looking each strand id up again with index().
    for c, sequence in zip(chains, full_structure):
        print('Chain %s' % (c))
        print('Sequence: %s' % (sequence))

It's just:
from Bio import PDB
from Bio.PDB import PDBParser

pdb = PDBParser().get_structure("1bfg", "1bfg.pdb")
for chain in pdb.get_chains():
    # Print the number of residues in this chain that PDB.is_aa() accepts.
    print(sum(1 for residue in chain.get_residues() if PDB.is_aa(residue)))

I appreciated Peters' answer, but I also realized that checking res.id[0] == " " is more robust (e.g. for HIE). PDB.is_aa() cannot detect that HIE is an amino acid, even though HIE is ε-nitrogen protonated histidine. So I recommend:
from Bio import PDB

parser = PDB.PDBParser()
pdb1 = './1bfg.pdb'
# Pass the path variable defined above (the original referenced an
# undefined name `pdb`, which raises NameError).
structure = parser.get_structure("1bfg", pdb1)

# Classify residues by the first field of their id (the hetero flag):
# a blank flag is counted as a regular residue, everything else as "other".
res_no = 0
non_resi = 0
for model in structure:
    for chain in model:
        for r in chain.get_residues():
            if r.id[0] == ' ':
                res_no += 1
            else:
                non_resi += 1
print("Residues: %i" % (res_no))
print("Other: %i" % (non_resi))

I think you would actually want to do something like
m = Bio.PDB.MMCIF2Dict.MMCIF2Dict(pdb_cif_file)
if '_entity_poly.pdbx_seq_one_letter_code' in m:
    full_structure = m['_entity_poly.pdbx_seq_one_letter_code']
    chains = m['_entity_poly.pdbx_strand_id']
    # One entry can name several chains that share a sequence as a
    # comma-separated string, so split it and report every chain id.
    for c, sequence in zip(chains, full_structure):
        for ci in c.split(','):
            print('Chain %s' % (ci))
            print('Sequence: %s' % (sequence))

Related

Making a print function repeat on a new line everytime it prints

So I want this final print function to print its function on a new line every time it prints. I've tried various "\n" placements to make it work but to no avail. Any tips?
from datetime import date

currentYear = date.today().year
print('Hi. What is your name?')
name = input()
# Keep asking until the user enters a non-negative integer.
while True:
    try:
        print('How old are you, ' + name + '?')
        age = int(input())
        if age >= 0:
            break
        else:
            print('That is not a valid number.')
    except ValueError:
        print('That is not a valid number')
ageinHundred = 100 - int(age)
y = currentYear + int(ageinHundred)
t = 'You will be 100 years old in the year ' + str(int((y)))
print(t)
print('Give me another number')
num = input()
# Multiplying a string by an int repeats it back to back on one line;
# this is the behaviour the question is asking about.
f = (int(num) * t)
print(f)
I want the final print function (print(f)) to print f multiple times on a new line each time. Not one after the other like the above code does now.
Thanks!
Change the last couple of lines to:
# Wrap t in a one-element list so that `*` repeats the list element
# (list multiplication) instead of gluing copies of the string together
f = int(num) * [t]
# Join the repeated strings with newlines so each copy lands on its own line
print("\n".join(f))
For the f = line, inspect f to get a better idea of what's going on there.
For the join part, join takes a list of strings, inserts the given string (in this case "\n"; a newline), and "joins" it all together. Get used to using join. It is a very helpful function.
Try this:
from datetime import date

currentYear = date.today().year
print('Hi. What is your name?')
name = input()
# Keep asking until the user enters a non-negative integer.
while True:
    try:
        print('How old are you, ' + name + '?')
        age = int(input())
        if age >= 0:
            break
        else:
            print('That is not a valid number.')
    except ValueError:
        print('That is not a valid number')
# age is already an int here, so no further conversion is needed.
ageinHundred = 100 - age
y = currentYear + ageinHundred
t = 'You will be 100 years old in the year ' + str(y)
print(t)
print('Give me another number')
num = input()
# Each print() call ends with a newline, so every copy gets its own line.
for i in range(int(num)):
    print(t)

How to look for anagrams in 1 or 2 dictionaries?

This is our code now:
#anagram is a word formed by rearranging the letters of a different word
# NOTE(review): the indentation of this snippet was lost. Presumably the
# lines from `w = word` to `e[counter] = s` form the loop body; whether the
# two print calls belong inside the loop cannot be told from here -- confirm.
text = open("words.txt")
counter = 0
# d and e both map the running line number to that line's sorted characters
d = {}
e = {}
# iterating over the file object yields one line at a time
for word in text:
w = word
a = list(word)
s = sorted(a)
counter += 1
d[counter] = s
e[counter] = s
print(d)
print(e)
We want to ask python to show us the words which have the same values/letters. So for example: AAB is the same as BAA.
Our file consists of:
aba
aab
acaba
ackba
abaca
casaba
Does anyone know how to program this?

Size of weights extracted from a NN model becomes higher than the model

I tried extracting the weights from a .pb tensorflow model and stored them in a text file..the size of the text file itself is higher than the model..why is this happening..?
Thanks in advance
Code to extract weights :
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.python.framework import tensor_util
import operator
from functools import reduce
import matplotlib.pyplot as plt
import zlib
import pickle
# Dumps the values of every 'Const' node of a serialized TensorFlow graph
# into weights.txt and counts how often each written value occurs.
# NOTE(review): the indentation of this snippet was lost, so the exact
# nesting (e.g. where the `with` blocks end) has to be confirmed.
PB_PATH = 'quantized_graph_resnet.pb'
# Read the serialized GraphDef from PB_PATH and import it into the session graph
with tf.Session() as sess:
with gfile.FastGFile(PB_PATH,'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
sess.graph.as_default()
tf.import_graph_def(graph_def, name='')
graph_nodes = [n for n in graph_def.node]
# Keep only the nodes whose op is 'Const'
wts = [n for n in graph_nodes if n.op=='Const']
# check(l): True when at least one element of l is itself a list
def check(l):
for item in l:
if(type(item) is list):
return True
return False
# Maps each written value to the number of times it has been seen
weightsFreq = {}
# Output file; this rebinds the name f used for the graph file above and is
# not closed anywhere in the visible snippet
f = open('weights.txt', 'w')
for n in wts:
print("Name of the node - %s" % n.name)
if(True):
# Convert the node's tensor to plain Python data (scalar or nested lists)
l = (tensor_util.MakeNdarray(n.attr['value'].tensor)).tolist()
# A single int is written directly and counted
if(isinstance(l, int)):
f.write('%d' % l)
f.write(' ')
if l in weightsFreq:
weightsFreq[l]+=1
else:
weightsFreq[l]=1
continue
# A single float is skipped entirely
if(isinstance(l, float)):
continue
# Concatenate the sub-lists repeatedly until no element is a list any more
while(check(l)):
l = reduce(operator.concat, l)
# Write every element as text; '%d' formats it as a decimal integer
for item in l :
f.write('%d' % item)
f.write(' ')
# print(item)
if item in weightsFreq:
weightsFreq[item]+=1
else:
weightsFreq[item]=1
# print("Value - ", tensor_util.MakeNdarray(n.attr['value'].tensor), type(tensor_util.MakeNdarray(n.attr['value'].tensor)), "\n")
Text files are a very inefficient way to store large quantities of decimal numbers: they use one byte for each digit of each number, whereas a binary file would use a fixed-size representation (4 bytes per number for a single-precision floating-point number).
That's why the text file is much bigger than a binary one.

Parsing an input file which contains polynomials

Hello experienced pythoners.
The goal is simply to read in my own files which have the following format, and to then apply mathematical operations to these values and polynomials. The files have the following format:
m1:=10:
m2:=30:
Z1:=1:
Z2:=-1:
...
Some very similar variables, next come the laguerre polynomials
...
F:= (12.58295)*L(0,x)*L(1,y)*L(6,z) + (30.19372)*L(0,x)*L(2,y)*L(2,z) - ...:
Where L stands for a laguerre polynomial and takes two arguments.
I have written a procedure in Python which splits apart each line into a left and right hand side split using the "=" character as a divider. The format of these files is always the same, but the number of laguerre polynomials in F can vary.
import re
# Read the whole input file; the with-block closes the handle afterwards.
with open("file.txt", "r") as handle:
    linestring = handle.read()

# Normalise the text so that every statement ends up on one line:
# collapse blank lines, re-join statements that were wrapped after a
# comma, a plus sign or ':=', then drop the terminating and remaining colons.
linestring = re.sub("\n\n", "\n", str(linestring))
linestring = re.sub(",\n", ",", linestring)
linestring = re.sub("\\+\n", "+", linestring)
linestring = re.sub(":=\n", ":=", linestring)
linestring = re.sub(":\n", "\n", linestring)
linestring = re.sub(":", "", linestring)

LINES = linestring.split("\n")
for LINE in LINES:
    LINE = re.sub(" ", "", LINE)
    print("LINE= %s" % LINE)
    if len(LINE) <= 0:
        # Skip empty lines. The original had a bare `next` here, which is
        # a no-op expression, so empty lines fell through to PAIR[1] and
        # raised IndexError.
        continue
    # Split the statement into its left- and right-hand side.
    PAIR = LINE.split("=")
    print("PAIR= %s" % (PAIR,))
    LHS = PAIR[0]
    RHS = PAIR[1]
    print("LHS= %s" % LHS)
    print("RHS= %s" % RHS)
The first re.sub block just deals with formatting the file and discarding characters that python will not be able to process; then a loop is performed to print 4 things, LINE, PAIR, LHS and RHS, and it does this nicely. using the example file from above the procedure will print the following:
LINE= m1=1
PAIR= ['m1', '1']
LHS= m1
RHS= 1
LINE= m2=1
PAIR= ['m2', '1']
LHS= m2
RHS= 1
LINE= Z1=-1
PAIR= ['Z1', '-1']
LHS= Z1
RHS= -1
LINE= Z2=-1
PAIR= ['Z2', '-1']
LHS= Z2
RHS= -1
LINE= F= 12.5*L(0,x)L(1,y) + 30*L(0,x)L(2,y)L(2,z)
PAIR=['F', '12.5*L(0,x)L(1,y) + 30*L(0,x)L(2,y)L(2,z)']
LHS= F
RHS= 12.5*L(0,x)L(1,y) + 30*L(0,x)L(2,y)L(2,z)
My question is what is the next best step to process this output and use it in a mathematical script, especially assigning the L to mean a laguerre polynomial? I tried putting the LHS and RHS into a dictionary, but found it troublesome to put F in it due to the laguerre polynomials.
Any ideas are welcome. Perhaps I am overcomplicating this and there is a much simpler way to parse this file.
Many thanks in advance
Your parsing algorithm doesn't seem to work correctly, as the RHS of your variables doesn't produce the expected result.
Also, the first re.sub block where you want to format the file seems overly complicated. Assuming every statement in your input file is terminated by a colon, you could get rid of all whitespace and newlines and separate the statements using the following code:
# Read the whole file; the with-block closes the handle afterwards.
with open('file.txt', 'r') as handle:
    linestring = handle.read()
# Remove every newline and space so each statement is one contiguous string.
strippedstring = linestring.replace('\n', '').replace(' ', '')
# Split on the terminating colons only: a colon followed by '=' is part of
# the ':=' assignment operator. The final element is whatever follows the
# last terminator and is dropped.
statements = re.split(r':(?!=)', strippedstring)[:-1]
Then you iterate over the statements and split each one in LHS and RHS:
for st in statements:
    # Every statement has the form '<name>:=<expression>'.
    lhs, rhs = re.split(':=', st)
    print('lhs= %s' % lhs)
    print('rhs= %s' % rhs)
In the next step, try to distinguish normal float variables and polynomials:
#evaluate rhs (this belongs inside the loop over the statements, after rhs is set)
try:
    #interpret as numeric constant
    f = float(rhs)
    print("  %s" % f)
except ValueError:
    #interpret as laguerre-polynomial
    # Turn every '-' into '+-' so the expression can be split on '+' alone
    # while each summand keeps its sign.
    summands = re.split(r'\+', re.sub('-', '+-', rhs))
    for s in summands:
        if not s:
            # A leading sign produces an empty first summand; ignore it.
            continue
        # Optional signed decimal factor followed by any number of
        # L(<index>,<variable>) terms. The trailing '$' forces the whole
        # summand to match; without it the all-optional pattern matches
        # every string and the "misformatted" branch can never be reached.
        m = re.match(r"^(?P<factor>-?[0-9]*(\.[0-9]*)?)(?P<poly>(\*?L\([0-9]+,[a-z]\))*)$", s)
        if not m:
            print(' polynomial misformatted')
            continue
        f = m.group('factor')
        print(' factor:  %s' % f)
        p = m.group('poly')
        for l in re.finditer(r"L\((?P<a>[0-9]+),(?P<b>[a-z])\)", p):
            print(' poly: L(%s,%s)' % (l.group("a"), l.group("b")))
This should work for your given example file.

Parse from file to dictionary in correct order, in Python

I've written some code to parse an EMBL file and dump specific regions of the file into a dictionary.
The keys of the dictionary correlate to the label of a specific region that I want to capture and each key's value is the region itself.
I have then created another function to write the contents of the dictionary to a text file.
However, I have found that the text file contains the information in a different order to that found in the original EMBL file.
I can't figure out why it is doing this - is it because dictionaries are unordered? Is there any way around it?
from Bio import SeqIO
s6633 = SeqIO.read("6633_seq.embl", "embl")
# make_dict_realgenes(x): walks x.features and, for every 'CDS' feature whose
# first 'product' qualifier does not contain 'hypothetical', stores a
# 30-position slice of x next to the feature in a dictionary, which is returned.
# NOTE(review): the indentation of this snippet was lost; the nesting
# described in the comments below is the presumed one -- confirm.
def make_dict_realgenes(x):
dict = {}
for i in range(len(x.features)):
if x.features[i].type == 'CDS':
if 'hypothetical' not in x.features[i].qualifiers['product'][0]:
# NOTE(review): the 'product' lookup on the line above runs before this try
# starts, so a missing 'product' key would raise there instead of reaching
# the except branch below -- confirm the intent.
try:
# strand == -1: take the 30 positions after the feature's end and store
# their reverse complement under the product name
if x.features[i].location.strand == -1:
x1 = x.features[i].location.end
y1 = x1 + 30
dict[str(x.features[i].qualifiers['product'][0])] =\
str(x[x1:y1].seq.reverse_complement())
# otherwise: take the 30 positions before the feature's start
else:
x2 = x.features[i].location.start
y2 = x2 - 30
dict[x.features[i].qualifiers['product'][0]] =\
str(x[y2:x2].seq)
# Fallback: same slices, but keyed by the first 'translation' qualifier
except KeyError:
if x.features[i].location.strand == -1:
x1 = x.features[i].location.end
y1 = x1 + 30
dict[str(x.features[i].qualifiers['translation'][0])] =\
str(x[x1:y1].seq.reverse_complement())
else:
x2 = x.features[i].location.start
y2 = x2 - 30
dict[x.features[i].qualifiers['translation'][0]] =\
str(x[y2:x2].seq)
return dict
# rbs_file(dict): formats every key/value pair of the given dictionary as
# ">key index" on one line and the value on the next, then writes all
# entries to out.txt.
# NOTE(review): the indentation of this snippet was lost; confirm the nesting.
def rbs_file(dict):
list = []
c = 0
# iteritems() exists only on Python 2 dictionaries
for k, v in dict.iteritems():
list.append(">" + k + " " + str(c) + "\n" + v + "\n")
c = c + 1
f = open("out.txt", "w")
# a is incremented per written entry but not read anywhere in this snippet
a = 0
for i in list:
f.write(i)
a = a + 1
f.close()
To preserve order in a dictionary, use an OrderedDict from collections. Try Changing the top of your code to this:
from collections import OrderedDict
from Bio import SeqIO

s6633 = SeqIO.read("6633_seq.embl", "embl")


def make_dict_realgenes(x):
    # An OrderedDict remembers the order in which keys were inserted.
    dict = OrderedDict()
    ...
Also, I would advise against overwriting the builtin 'dict' if you can easily rename it.
I slightly refactored your code, and I suggest writing the output as it is produced while parsing the file, instead of relying on OrderedDicts.
from Bio import SeqIO

# Write each record as soon as it is produced so the output keeps the
# order of the input file; the with-block closes the file even on errors.
with open("out.txt", "w") as output:
    for seq in SeqIO.parse("CP001187.embl", "embl"):
        for feature in seq.features:
            if feature.type == "CDS":
                # Prefer the product name and fall back to the translation.
                values = (feature.qualifiers.get("product") or
                          feature.qualifiers.get("translation"))
                if not values:
                    # Neither qualifier is present: nothing to label the
                    # record with, so skip this feature.
                    continue
                qualifier = values[0]
                if "hypothetical" not in qualifier:
                    if feature.location.strand == -1:
                        # Minus strand: take the 30 positions after the
                        # feature's end and reverse-complement them.
                        x1 = feature.location.end
                        x2 = x1 + 30
                        sequence = seq[x1:x2].seq.reverse_complement()
                    else:
                        # Plus strand: take the 30 positions before the
                        # start. Clamp at 0, because a negative slice start
                        # would count from the end of the sequence.
                        x1 = feature.location.start
                        x2 = max(0, x1 - 30)
                        sequence = seq[x2:x1].seq
                    output.write(">" + qualifier + "\n")
                    output.write(str(sequence) + "\n")
                    # You can always insert here to the OrderedDict anyway, e.g.
                    # d[qualifier] = str(sequence)
In python only rarely for i in range(len(anything)) is the way to go.
There is also a cleaner way to output your sequences using Biopython. Use a list to append the Seqs, instead of a dict or OrderedDict:
from Bio.SeqRecord import SeqRecord
my_seqs = []
# Each time you generate a sequence, instead of writing it to a file
# or inserting it into a dict, do this:
my_seqs.append(SeqRecord(sequence, id=qualifier, description=""))
# Once my_seqs is complete, all records can be written with a single call:
SeqIO.write(my_seqs, "output.fas", "fasta")

Resources