I am trying to repartition multiple .parquet files so that I end up with a specific number of parquet files. My data is time-series data that depends on the number of observations (rather than timestamps) for each client, so I need to ensure that the repartitioning never splits one client's series across two files. I also want to preserve the row order, since the labels are stored elsewhere in the same order. Here is an example of what I am trying to do:
import pandas as pd
import dask.dataframe as dd
ids = [9635, 1536, 8477, 1088, 6411, 2251]
df = pd.DataFrame({
    "partition": [0]*3 + [1]*3 + [2]*3 + [3]*3 + [4]*3 + [5]*3,
    "customer_id": [ids[0]]*3 + [ids[1]]*3 + [ids[2]]*3 + [ids[3]]*3 + [ids[4]]*3 + [ids[5]]*3,
    "x": range(18)})
# indexing on "customer_id" here
df = df.set_index("customer_id")
ddf = dd.from_pandas(df, npartitions=6)
ddf.to_parquet("my_parquets")
read_ddf = dd.read_parquet("my_parquets/*.parquet")
last_idx = [ids[-1]]
my_divisions = ids + last_idx
read_ddf.divisions = my_divisions
# Split into two equal partitions with three customers each
new_divisions = [my_divisions[0], my_divisions[3], my_divisions[5]]
new_ddf = read_ddf.repartition(divisions=new_divisions)
which raises an error:
ValueError: New division must be sorted
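(For reference, a minimal sketch on a toy sorted index, not the real data, of what repartition(divisions=...) accepts: the divisions list must be monotonically increasing, which is why the unsorted customer ids above are rejected.)
import pandas as pd
import dask.dataframe as dd

# Toy frame with a sorted integer index, purely for illustration.
pdf = pd.DataFrame({"x": range(6)}, index=[10, 20, 30, 40, 50, 60])
toy = dd.from_pandas(pdf, npartitions=3)

toy.repartition(divisions=[10, 40, 60])    # sorted divisions: accepted
# toy.repartition(divisions=[40, 10, 60])  # unsorted: ValueError like the one above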
I have tried an alternative approach: set the "partition" column as the index and switch the index to the customer ids later. However, this sorts the entire dataframe by the new index, which is undesired because the resulting sequence no longer matches the labels stored elsewhere. This is shown here:
import pandas as pd
import dask.dataframe as dd
ids = [9635, 1536, 8477, 1088, 6411, 2251]
df = pd.DataFrame({
    "partition": [0]*3 + [1]*3 + [2]*3 + [3]*3 + [4]*3 + [5]*3,
    "customer_id": [ids[0]]*3 + [ids[1]]*3 + [ids[2]]*3 + [ids[3]]*3 + [ids[4]]*3 + [ids[5]]*3,
    "x": range(18)})
# indexing on the defined "partition" instead
df = df.set_index("partition")
ddf = dd.from_pandas(df, npartitions=6)
ddf.to_parquet("my_parquets")
read_ddf = dd.read_parquet("my_parquets/*.parquet")
# my_range is equivalent to the list of partitions
my_range = [i for i in range(0,6)]
last_idx = [my_range[-1]]
my_divisions = my_range + last_idx
read_ddf.divisions = my_divisions
new_divisions = [0, 2, 4, 5]
new_ddf = read_ddf.repartition(divisions=new_divisions)
# Need the "customer_id" as index
new_ddf = new_ddf.set_index("customer_id", drop=True)
But this sorts the dataframe by the index and messes up the structure, while I would like to keep the original order.
print("Partition 0")
print(new_ddf.get_partition(0).compute())
print("-------------------")
print("Partition 1")
print(new_ddf.get_partition(1).compute())
print("-------------------")
print("Partition 2")
print(new_ddf.get_partition(2).compute())
Partition 0
Empty DataFrame
Columns: [x]
Index: []
-------------------
Partition 1
x
customer_id
1088 9
1088 10
1088 11
1536 3
1536 4
1536 5
-------------------
Partition 2
x
customer_id
2251 15
2251 16
2251 17
6411 12
6411 13
6411 14
8477 6
8477 7
8477 8
9635 0
9635 1
9635 2
Are there any workarounds for this issue? I am aware that set_index in Dask is quite expensive, but none of my approaches are currently working. Also, in my case I already have the .parquet files with the preprocessed data; I only created the initial dataframe in pandas for demonstration purposes (if I had all the data in pandas it would have been much easier to specify the number of partitions in the first step).
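One possible workaround (a sketch only, assuming the part.N.parquet file layout that to_parquet writes in the example above, one file per customer): treat each existing file as one delayed pandas frame and build each output partition from a fixed group of files, so the original order is preserved and no series is split.
import pandas as pd
import dask
import dask.dataframe as dd

# Assumes six input files named part.0.parquet ... part.5.parquet,
# in the order they were written.
files = [f"my_parquets/part.{i}.parquet" for i in range(6)]
groups = [files[:3], files[3:]]    # two output partitions, three customers each

@dask.delayed
def load_group(paths):
    return pd.concat([pd.read_parquet(p) for p in paths])

new_ddf = dd.from_delayed([load_group(g) for g in groups])
new_ddf.to_parquet("my_parquets_repartitioned")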
My dataset is a set of system calls for both malware and benign software. I preprocessed it, and it now looks like this:
NtQueryPerformanceCounter
NtProtectVirtualMemory
NtProtectVirtualMemory
NtQuerySystemInformation
NtQueryVirtualMemory
NtQueryVirtualMemory
NtProtectVirtualMemory
NtOpenKey
NtOpenKey
NtOpenKey
NtQuerySecurityAttributesToken
NtQuerySecurityAttributesToken
NtQuerySystemInformation
NtQuerySystemInformation
NtAllocateVirtualMemory
NtFreeVirtualMemory
Now I'm using TF-IDF to extract the features, using n-grams of the calls as the terms:
from __future__ import print_function
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.svm import OneClassSVM
nGRAM1 = 8
nGRAM2 = 10
weight = 4
main_corpus_MAL = []
main_corpus_target_MAL = []
main_corpus_BEN = []
main_corpus_target_BEN = []
my_categories = ['benign', 'malware']
# feeding corpus the testing data
print("Loading system call database for categories:")
print(my_categories if my_categories else "all")
import glob
import os
malCOUNT = 0
benCOUNT = 0
for filename in glob.glob(os.path.join('C:\\Users\\alika\\Documents\\testingSVM\\sysMAL', '*.txt')):
    fMAL = open(filename, "r")
    aggregate = ""
    for line in fMAL:
        linea = line[:(len(line)-1)]
        aggregate += " " + linea
    main_corpus_MAL.append(aggregate)
    main_corpus_target_MAL.append(1)
    malCOUNT += 1
for filename in glob.glob(os.path.join('C:\\Users\\alika\\Documents\\testingSVM\\sysBEN', '*.txt')):
    fBEN = open(filename, "r")
    aggregate = ""
    for line in fBEN:
        linea = line[:(len(line) - 1)]
        aggregate += " " + linea
    main_corpus_BEN.append(aggregate)
    main_corpus_target_BEN.append(0)
    benCOUNT += 1
# weight as determined in the top of the code
train_corpus = main_corpus_BEN[:(weight*len(main_corpus_BEN)//(weight+1))]
train_corpus_target = main_corpus_target_BEN[:(weight*len(main_corpus_BEN)//(weight+1))]
test_corpus = main_corpus_MAL[(len(main_corpus_MAL)-(len(main_corpus_MAL)//(weight+1))):]
test_corpus_target = main_corpus_target_MAL[(len(main_corpus_MAL)-len(main_corpus_MAL)//(weight+1)):]
def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6
# size of datasets
train_corpus_size_mb = size_mb(train_corpus)
test_corpus_size_mb = size_mb(test_corpus)
print("%d documents - %0.3fMB (training set)" % (
len(train_corpus_target), train_corpus_size_mb))
print("%d documents - %0.3fMB (test set)" % (
len(test_corpus_target), test_corpus_size_mb))
print("%d categories" % len(my_categories))
print()
print("Benign Traces: "+str(benCOUNT)+" traces")
print("Malicious Traces: "+str(malCOUNT)+" traces")
print()
print("Extracting features from the training data using a sparse vectorizer...")
t0 = time()
vectorizer = TfidfVectorizer(ngram_range=(nGRAM1, nGRAM2), min_df=1, use_idf=True, smooth_idf=True) ##############
analyze = vectorizer.build_analyzer()
X_train = vectorizer.fit_transform(train_corpus)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, train_corpus_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()
print("Extracting features from the test data using the same vectorizer...")
t0 = time()
X_test = vectorizer.transform(test_corpus)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, test_corpus_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()
The output is:
Loading system call database for categories:
['benign', 'malware']
177 documents - 45.926MB (training set)
44 documents - 12.982MB (test set)
2 categories
Benign Traces: 72 traces
Malicious Traces: 150 traces
Extracting features from the training data using a sparse vectorizer...
done in 7.831695s at 5.864MB/s
n_samples: 177, n_features: 603170
Extracting features from the test data using the same vectorizer...
done in 1.624100s at 7.993MB/s
n_samples: 44, n_features: 603170
Now for the learning section I'm trying to use sklearn OneClassSVM:
print("==================\n")
print("Training: ")
classifier = OneClassSVM(kernel='linear', gamma='auto')
classifier.fit(X_test)
fraud_pred = classifier.predict(X_test)
unique, counts = np.unique(fraud_pred, return_counts=True)
print (np.asarray((unique, counts)).T)
fraud_pred = pd.DataFrame(fraud_pred)
fraud_pred = fraud_pred.rename(columns={0: 'prediction'})
main_corpus_target = pd.DataFrame(main_corpus_target)
main_corpus_target = main_corpus_target.rename(columns={0: 'Category'})
This is the output of fraud_pred and main_corpus_target:
prediction
0 1
1 -1
2 1
3 1
4 1
5 -1
6 1
7 -1
...
30 rows * 1 column
====================
Category
0 1
1 1
2 1
3 1
4 1
...
217 0
218 0
219 0
220 0
221 0
222 rows * 1 column
But when I try to calculate TP, TN, FP, FN:
## Performance check of the model
TP = FN = FP = TN = 0
for j in range(len(main_corpus_target)):
    if main_corpus_target['Category'][j] == 0 and fraud_pred['prediction'][j] == 1:
        TP = TP + 1
    elif main_corpus_target['Category'][j] == 0 and fraud_pred['prediction'][j] == -1:
        FN = FN + 1
    elif main_corpus_target['Category'][j] == 1 and fraud_pred['prediction'][j] == 1:
        FP = FP + 1
    else:
        TN = TN + 1
print(TP, FN, FP, TN)
I get this error:
KeyError Traceback (most recent call last)
<ipython-input-32-1046cc75ba83> in <module>
7 elif main_corpus_target['Category'][j]== 0 and fraud_pred['prediction'][j] == -1:
8 FN = FN+1
----> 9 elif main_corpus_target['Category'][j]== 1 and fraud_pred['prediction'][j] == 1:
10 FP = FP+1
11 else:
c:\users\alika\appdata\local\programs\python\python36\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
1069 key = com.apply_if_callable(key, self)
1070 try:
-> 1071 result = self.index.get_value(self, key)
1072
1073 if not is_scalar(result):
c:\users\alika\appdata\local\programs\python\python36\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
4728 k = self._convert_scalar_indexer(k, kind="getitem")
4729 try:
-> 4730 return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
4731 except KeyError as e1:
4732 if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 30
1) I know the error occurs because the loop is trying to access a key that isn't in the index of fraud_pred, but I can't just insert some numbers into fraud_pred to work around this. Any suggestions?
2) Am I doing anything wrong that makes the lengths not match?
3) I want to compare the results to other one-class classification algorithms. Given my approach, which ones would be best to use?
Edit: Before calculating the metrics:
You could change your fit and predict functions to:
fraud_pred = classifier.fit_predict(X_test)
Also, your main_corpus_target and X_test should have the same length, can you put the code where you create main_corpus_target please?
It's created right after benCOUNT += 1:
main_corpus_target = main_corpus_target_MAL
main_corpus_target.extend(main_corpus_target_BEN)
This means that you are creating a main_corpus_target that includes MAL and BEN, and the error you get is:
ValueError: Found input variables with inconsistent numbers of samples: [30, 222]
The number of samples of fraud_pred is 30, so you should evaluate them with an array of 30. main_corpus_target contains 222.
Looking at your code, I see that you want to evaluate X_test, which is built from test_corpus (X_test = vectorizer.transform(test_corpus)). It would be better to compare your results to test_corpus_target, which is the target variable of your test set and also has a length of 30.
These two lines that you have should output the same length:
test_corpus = main_corpus_MAL[(len(main_corpus_MAL)-(len(main_corpus_MAL)//(weight+1))):]
test_corpus_target = main_corpus_target_MAL[(len(main_corpus_MAL)-len(main_corpus_MAL)//(weight+1)):]
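A quick sanity check along those lines (just a sketch): the test corpus, its targets, and the transformed matrix should all report the same number of samples before any metric is computed.
print(len(test_corpus), len(test_corpus_target), X_test.shape[0])   # should all match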
May I ask why you are calculating TP, TN, etc. by yourself?
You have a faster option:
1) Transform the fraud_pred series, replacing -1 with 0.
2) Use the confusion_matrix function that sklearn offers.
3) Use ravel to extract the values of the confusion matrix.
An example, after transforming the -1 to 0:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(main_corpus_target['Category'].values, fraud_pred).ravel()
Also, if you are using the latest pandas version:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(main_corpus_target['Category'].to_numpy(), fraud_pred).ravel()
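For completeness, a small sketch of the replacement step mentioned above (my assumption: OneClassSVM's -1 outlier label maps to class 0 and +1 to class 1), evaluated against a target vector of the same length as X_test:
import numpy as np
from sklearn.metrics import confusion_matrix

# Sketch only: OneClassSVM returns +1 for inliers and -1 for outliers, so map
# -1 -> 0 and +1 -> 1 before comparing with the 0/1 ground-truth labels.
pred = classifier.fit_predict(X_test)
pred_01 = np.where(pred == -1, 0, 1)
y_true = np.asarray(test_corpus_target)     # same length as X_test has rows
tn, fp, fn, tp = confusion_matrix(y_true, pred_01, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)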
I have a dataset of 1127 patients. My goal was to classify each patient as 0 or 1.
I have two different classifiers, but with the same purpose: to classify each patient as 0 or 1.
I ran one classifier on 364 patients and the second classifier on the other 763 patients.
For each classifier/group I generated a ROC curve.
Now I would like to combine the curves.
Could someone guide me on how to do it?
I'm thinking of calculating the weighted FPR and TPR, but I'm not sure how to do it.
The number of FPR/TPR pairs differs between the curves (the first ROC curve is based on 312 pairs and the second on 666 pairs).
Thanks!!!
Imports
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
Data generation
# simulate first dataset with 364 obs
df1 = pd.DataFrame(range(364))
df1['predict_proba_1'] = np.random.normal(0, 1, len(df1))
df1['epsilon'] = np.random.normal(0, 1, len(df1))
df1['true'] = (0.7 * df1['epsilon'] < df1['predict_proba_1']) * 1
df1 = df1.drop(columns=[0, 'epsilon'])

# simulate second dataset with 763 obs
df2 = pd.DataFrame(range(763))
df2['predict_proba_2'] = np.random.normal(0, 1, len(df2))
df2['epsilon'] = np.random.normal(0, 1, len(df2))
df2['true'] = (0.7 * df2['epsilon'] < df2['predict_proba_2']) * 1
df2 = df2.drop(columns=[0, 'epsilon'])
Quick look at generated data
df1
predict_proba_1 true
0 1.234549 1
1 -0.586544 0
2 -0.229539 1
3 0.132185 1
4 -0.411284 0
.. ... ...
359 -0.218775 0
360 -0.985565 0
361 0.542790 1
362 -0.463667 0
363 1.119244 1
[364 rows x 2 columns]
df2
predict_proba_2 true
0 0.278755 1
1 0.653663 0
2 -0.304216 1
3 0.955658 1
4 -1.341669 0
.. ... ...
758 1.359606 1
759 -0.605894 0
760 0.379738 0
761 1.571615 1
762 -1.102565 0
[763 rows x 2 columns]
Necessary functions
def show_ROCs(scores_list: list, ys_list: list, labels_list: list = None):
    """
    This function plots a couple of ROCs. Corresponding labels are optional.

    Parameters
    ----------
    scores_list : list of array-likes with scorings or predicted probabilities.
    ys_list : list of array-likes with ground truth labels.
    labels_list : list of labels to be displayed in the plotted graph.

    Returns
    ----------
    None
    """
    if len(scores_list) != len(ys_list):
        raise Exception('len(scores_list) != len(ys_list)')

    fpr_dict = dict()
    tpr_dict = dict()
    for x in range(len(scores_list)):
        fpr_dict[x], tpr_dict[x], _ = roc_curve(ys_list[x], scores_list[x])

    for x in range(len(scores_list)):
        try:
            plot_ROC(fpr_dict[x], tpr_dict[x], str(labels_list[x]) + ' AUC:' + str(round(auc(fpr_dict[x], tpr_dict[x]), 3)))
        except:
            plot_ROC(fpr_dict[x], tpr_dict[x], str(x) + ' ' + str(round(auc(fpr_dict[x], tpr_dict[x]), 3)))
    plt.show()
def plot_ROC(fpr, tpr, label):
    """
    This function plots a single ROC. A corresponding label is optional.

    Parameters
    ----------
    fpr : array-like with fpr.
    tpr : array-like with tpr.
    label : label to be displayed in the plotted graph.

    Returns
    ----------
    None
    """
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
Plotting
show_ROCs(
[df1['predict_proba_1'], df2['predict_proba_2']],
[df1['true'], df2['true']],
['df1 with {} obs'.format(len(df1)), 'df2 with {} obs'.format(len(df2))]
)
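If a single combined curve is wanted rather than two overlaid ones, one simple option (a sketch, and only reasonable if the two classifiers' scores are on comparable scales) is to pool the two disjoint patient groups and compute one ROC over the concatenated scores and labels:
# Pool both groups (the patients are disjoint) and compute a single ROC.
scores_all = np.concatenate([df1['predict_proba_1'], df2['predict_proba_2']])
labels_all = np.concatenate([df1['true'], df2['true']])
fpr_all, tpr_all, _ = roc_curve(labels_all, scores_all)
plot_ROC(fpr_all, tpr_all, 'pooled AUC: ' + str(round(auc(fpr_all, tpr_all), 3)))
plt.show()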
Following the subsampling-inside-of-resampling procedure exemplified at https://topepo.github.io/caret/subsampling-for-class-imbalances.html#subsampling-during-resampling, my question is simply how to extract the actual data set resulting from this procedure when the caret method is "rf" and the sampling method is "smote".
If, for example, method = "glm" is used, the data can be extracted with model$finalModel$data; if method = "rpart", the data can similarly be extracted with model$finalModel$call$data.
Using subsampling inside of resampling with method = "rpart", the SMOTE data set can be extracted as follows:
library(caret)
library(DMwR)
data("GermanCredit")
set.seed(122)
index1<-createDataPartition(GermanCredit$Class, p=.7, list = FALSE)
training<-GermanCredit[index1, ]
#testing<-GermanCredit[-index1,]
colnames(training)
metric <- "ROC"
ctrl1 <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  search = "random",
  classProbs = TRUE,      # note class probabilities included
  savePredictions = T,    # "final"
  returnResamp = "final",
  allowParallel = TRUE,
  summaryFunction = twoClassSummary,
  sampling = "smote")
set.seed(1)
mod_fit <- train(Class ~ Age +
                   ForeignWorker +
                   Property.RealEstate +
                   Housing.Own +
                   CreditHistory.Critical,
                 data = training, method = "rpart",
                 metric = metric,
                 trControl = ctrl1)
mod_fit # ROC 0.5951215
dat_smote<- mod_fit$finalModel$call$data
table(dat_smote$.outcome)
# Bad Good
# 630 840
head(dat_smote)
# Age ForeignWorker Property.RealEstate Housing.Own CreditHistory.Critical .outcome
# 40 1 0 1 1 Good
# 29 1 0 0 0 Good
# 37 1 1 0 1 Good
# 47 1 0 0 0 Good
# 53 1 0 1 0 Good
# 29 1 0 1 0 Good
I would simply like to be able to perform the same data set extraction when method = "rf". The code might look like this:
dat<- mod_fit$trainingData[mod_fit$trainingData == mod_fit$finalModel$x,]
I think that the only way to do it is to write a custom model that saves the data object in the fit module (that's pretty unsatisfying though).
I am attempting to take a dask dataframe, group by column 'A' and remove the groups where there are fewer than MIN_SAMPLE_COUNT rows.
For example, the following code works in pandas:
import pandas as pd
import dask as da
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
grouped = x.groupby('A')
x = grouped.filter(lambda x: x['A'].count().astype(int) > MIN_SAMPLE_COUNT)
However, in Dask if I try something analogous:
import pandas as pd
import dask.dataframe as dd
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
x = dd.from_pandas(x, npartitions=2)
grouped = x.groupby('A')
x = grouped.filter(lambda x: x['A'].count().astype(int) > MIN_SAMPLE_COUNT)
I get the following error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getattr__(self, key)
1162 try:
-> 1163 return self[key]
1164 except KeyError as e:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getitem__(self, key)
1153 # error is raised from pandas
-> 1154 g._meta = g._meta[key]
1155 return g
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\base.py in __getitem__(self, key)
274 if key not in self.obj:
--> 275 raise KeyError("Column not found: {key}".format(key=key))
276 return self._gotitem(key, ndim=1)
KeyError: 'Column not found: filter'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-55-d8a969cc041b> in <module>()
1 # Remove sixty second blocks that have fewer than MIN_SAMPLE_COUNT samples.
2 grouped = dat.groupby('KPI_60_seconds')
----> 3 dat = grouped.filter(lambda x: x['KPI_60_seconds'].count().astype(int) > MIN_SAMPLE_COUNT)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getattr__(self, key)
1163 return self[key]
1164 except KeyError as e:
-> 1165 raise AttributeError(e)
1166
1167 #derived_from(pd.core.groupby.DataFrameGroupBy)
AttributeError: 'Column not found: filter'
The error message suggests that the filter method available in pandas has not been implemented in Dask (nor did I find it after a search).
Is there Dask functionality that captures what I am looking to do? I have gone through the Dask API and nothing stood out as what I need. I am currently using Dask 1.1.1.
Thank you for your help.
Fairly new to Dask myself. One way to achieve what you are trying to do could be as follows:
Dask version: 0.17.3
import pandas as pd
import dask.dataframe as dd
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
print("x (before):")
print(x) # still pandas
x = dd.from_pandas(x, npartitions=2)
grouped = x.groupby('A').B.count().reset_index()
grouped = grouped.rename(columns={'B': 'Count'})
y = dd.merge(x, grouped, on=['A'])
y = y[y.Count > MIN_SAMPLE_COUNT]
x = y[['A', 'B', 'C']]
print("x (after):")
print(x.compute()) # needs compute for conversion to pandas df
Output:
x (before):
A B C
0 1 2 3
1 1 5 6
2 2 8 9
3 1 3 5
x (after):
A B C
0 1 2 3
1 1 5 6
1 1 3 5
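Another possible sketch (assuming the set of distinct 'A' values is small enough to collect into memory): compute the group sizes eagerly, keep the qualifying keys, and filter lazily with isin().
import pandas as pd
import dask.dataframe as dd

MIN_SAMPLE_COUNT = 1
pdf = pd.DataFrame([[1, 2, 3], [1, 5, 6], [2, 8, 9], [1, 3, 5]], columns=['A', 'B', 'C'])
ddf = dd.from_pandas(pdf, npartitions=2)

sizes = ddf.groupby('A').size().compute()             # small pandas Series, one row per key
keep = sizes[sizes > MIN_SAMPLE_COUNT].index.tolist()
print(ddf[ddf['A'].isin(keep)].compute())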
I am trying to generate a weblogo for the protein sequences provided. The following is my code:
from Bio.Seq import Seq
from Bio import motifs
from Bio.Alphabet import generic_protein
instances = [Seq("RWST"),
Seq("RTAG"),
Seq("RQGC"),
Seq("RMAA"),
]
m = motifs.create(instances)
m.weblogo("mymotif.png")
I get the following error:
counts[letter][position] += 1
KeyError: 'R'
Full stack trace:
<ipython-input-3-ee8922743152> in <module>()
10
11
---> 12 m = motifs.create(instances)
13 m.weblogo("mymotif.png")
lib/site-packages/Bio/motifs/__init__.py in create(instances, alphabet)
21 def create(instances, alphabet=None):
22 instances = Instances(instances, alphabet)
---> 23 return Motif(instances=instances, alphabet=alphabet)
24
25
lib/site-packages/Bio/motifs/__init__.py in __init__(self, alphabet, instances, counts)
236 self.instances = instances
237 alphabet = self.instances.alphabet
--> 238 counts = self.instances.count()
239 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts)
240 self.length = self.counts.length
lib/site-packages/Bio/motifs/__init__.py in count(self)
192 for instance in self:
193 for position, letter in enumerate(instance):
--> 194 counts[letter][position] += 1
195 return counts
196
KeyError: 'R'
Motif takes an alphabet as a keyword (named) argument, and so does motifs.create. If none is given, Biopython assumes the sequence is DNA, and in your case 'R' is not found in that alphabet.
For your example you would need to use IUPAC.protein to make it work.
Note: Biopython uses the alphabet's letters internally to see which characters are available; generic_protein has no letters.
from Bio import motifs
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
instances = [Seq("RWST", IUPAC.protein),
Seq("RTAG", IUPAC.protein),
Seq("RQGC", IUPAC.protein),
Seq("RMAA", IUPAC.protein),
]
m = motifs.create(instances, IUPAC.protein)
m.weblogo("mymotif.png")
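To see the point about letters directly (a quick check, assuming a Biopython version older than 1.78, where Bio.Alphabet still exists), you can inspect the alphabets involved:
from Bio.Alphabet import IUPAC, generic_protein

print(IUPAC.protein.letters)          # 'ACDEFGHIKLMNPQRSTVWY' -> 'R' is valid
print(IUPAC.unambiguous_dna.letters)  # 'GATC' -> no 'R', hence the KeyError above
print(generic_protein.letters)        # None -> no usable letters either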