Apache Beam - Multiple PCollection - DataframeTransform Issue - google-cloud-dataflow

I am running the sample below in Apache Beam:
import apache_beam as beam
from apache_beam import Row
from apache_beam import Pipeline
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.dataframe.convert import to_dataframe, to_pcollection
from apache_beam.dataframe.transforms import DataframeTransform
import logging
import argparse
import sys
import pandas
logging.getLogger().setLevel(logging.INFO)
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(sys.argv)
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = True
# Just a Dummy Dataframe Transform Function, Ignore the logic
def transformdf(a, b):
    a["addr"] = "addr-common"
    return a
p = beam.Pipeline(options=pipeline_options)
# Schema Aware Pcollection
data1 = [Row(id=1, name="abc"), Row(id=2, name="def"), Row(id=3, name="ghi")]
pcol1 = (p | "Create1" >> beam.Create(data1))
data2 = [Row(addr="addr1"), Row(addr="addr2"), Row(addr="addr3")]
pcol2 = (p | "Create2" >> beam.Create(data2))
pcol = ({"a":pcol1, "b":pcol2} | "TransformedDF" >> DataframeTransform(transformdf))
# The above line throws a duplicate-label error
pcol | "Map" >> beam.Map(lambda row: {"id":row.id, "name":row.name, "addr":row.addr}) | "Print" >> beam.Map(print)
p.run().wait_until_finish()
The code errors out with:
RuntimeError: A transform with label "TransformedDF/BatchElements(pc)" already exists in the pipeline
The syntax and usage seem correct per the documentation:
https://beam.apache.org/documentation/dsls/dataframes/overview/#embedding-dataframes-in-a-pipeline
output = {"a": pcol1, "b": pcol2} | DataframeTransform(lambda/function)
I am currently using Apache Beam 2.35.0.
Is this an issue with the Python SDK?
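One thing worth trying as a workaround (only a sketch built on the same DataFrame API from that page, not a confirmed fix for the duplicate-label error) is to convert each schema-aware PCollection to a deferred DataFrame yourself and apply the function directly:
from apache_beam.dataframe.convert import to_dataframe, to_pcollection
# Convert each schema-aware PCollection to a deferred DataFrame,
# apply the same function, then convert the result back to a PCollection.
df_a = to_dataframe(pcol1)
df_b = to_dataframe(pcol2)
result_df = transformdf(df_a, df_b)
pcol = to_pcollection(result_df)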

Related

How to save a contour plot in .png format in Python

I want to save a contour plot in Python, but I can't get it to work.
I used plt.savefig(), but the saved image is empty.
Why?
!pip install tftb
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
from tftb.generators import atoms
import tftb
import cv2
mat = sio.loadmat('/content/drive/MyDrive/z5_25.mat')
signal = mat['z']
z = signal.T
images_dir = '/content/drive/MyDrive/image'
for i in range(122):
    wvd = tftb.processing.WignerVilleDistribution(z[i])
    wvd.run()
    fig = plt.figure()
    plt.rcParams['figure.figsize'] = (1.5, 1.5)
    wvd.plot(kind='contour')
    plt.savefig(f"{images_dir}/fig5_25_{i}.png")
    plt.show()
I know it is late, but you can do so by replacing plt with wvd in the savefig call, i.e. instead of
wvd.plot(kind='contour')
plt.savefig(f"{images_dir}/fig5_25_{i}.png")
use:
wvd.plot(kind='contour')
wvd.savefig(f"{images_dir}/fig5_25_{i}.png")
It should work.
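If wvd.savefig is not available in your tftb version, another pattern to try (a sketch that assumes wvd.plot draws on the current matplotlib figure, which I have not verified for tftb) is to grab that figure with plt.gcf() and save it explicitly before plt.show():
import matplotlib.pyplot as plt
# Sketch for a single signal, reusing z and images_dir from the question;
# assumes wvd.plot draws on the current matplotlib figure.
wvd = tftb.processing.WignerVilleDistribution(z[0])
wvd.run()
wvd.plot(kind='contour')
fig = plt.gcf()                             # the figure holding the contour
fig.savefig(f"{images_dir}/fig5_25_0.png")  # save before plt.show()
plt.show()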

How can I use "if __name__ == '__main__':" in my code below?

After I run my code, I get this warning:
"RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable."
My question is how and where I should add this line of code to avoid this warning/error. My code is below:
from scipy import stats, optimize
import pymc3 as pm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
#from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from theano import shared
np.random.seed(9)
#Load the Data
dataset = pd.read_csv('PV-PCM.csv')
X=dataset.iloc[:,[0,1,2,3,4]].values
y=dataset.iloc[:,5].values
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.2, random_state=42)
#Shapes
X.shape, y.shape, X_tr.shape, X_te.shape
#Preprocess data for Modeling
shA_X = shared(X_tr)
#Generate Model
linear_model = pm.Model()
with linear_model:
    # Priors for unknown model parameters
    alpha = pm.Normal("alpha", mu=y_tr.mean(), sd=10)
    betas = pm.Normal("betas", mu=0, sd=1000, shape=X.shape[1])
    sigma = pm.HalfNormal("sigma", sd=100)  # you could also try a HalfCauchy, which has longer/fatter tails
    mu = alpha + pm.math.dot(betas, X_tr.T)
    likelihood = pm.Normal("likelihood", mu=mu, sd=sigma, observed=y_tr)
    step = pm.NUTS()
    trace = pm.sample(1000, step)
    chain = trace[100:]
    #pm.traceplot(chain);
    #Traceplot
    pm.traceplot(trace)
    ppc = pm.sample_prior_predictive(samples=1000, random_seed=9)
pm.plot_posterior(trace, figsize = (12, 10))
sns.kdeplot(y_tr, alpha=0.5, lw=4, c='b')
for i in range(100):
    sns.kdeplot(ppc['likelihood'][i], alpha=0.1, c='g')
alpha_pred = chain['alpha'].mean()
betas_pred = chain['betas'].mean(axis=0)
y_pred = alpha_pred + np.dot(betas_pred, X_tr.T)
Thank you all.
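For reference, a minimal sketch of the usual idiom, using the question's own model code: move the executable part of the script into a main() function and guard the call with if __name__ == '__main__':, so that the worker processes spawned by pm.sample can import the module without re-running it.
import pandas as pd
import pymc3 as pm
from sklearn.model_selection import train_test_split

def main():
    # Load the data and build the model exactly as in the script above.
    dataset = pd.read_csv('PV-PCM.csv')
    X = dataset.iloc[:, [0, 1, 2, 3, 4]].values
    y = dataset.iloc[:, 5].values
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
    with pm.Model() as linear_model:
        alpha = pm.Normal("alpha", mu=y_tr.mean(), sd=10)
        betas = pm.Normal("betas", mu=0, sd=1000, shape=X.shape[1])
        sigma = pm.HalfNormal("sigma", sd=100)
        mu = alpha + pm.math.dot(betas, X_tr.T)
        pm.Normal("likelihood", mu=mu, sd=sigma, observed=y_tr)
        trace = pm.sample(1000)  # child processes are started here
    print(pm.summary(trace))

if __name__ == '__main__':
    # Only the parent process runs main(); spawned workers just import the module.
    main()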

How to implement LIME in a Bert model?

I am new to machine learning. I noticed that such questions have been asked before but did not receive a proper solution. Below is the code for semantic similarity, and I want to implement LIME on top of it. Please help me out.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# Two lists of sentences
sentences1 = ['The cat sits outside',
              'A man is playing guitar',
              'The new movie is awesome']
sentences2 = ['The cat sits outside',
              'A woman watches TV',
              'The new movie is so great']
#Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
#Compute cosine similarities
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))
I don't know what Bert is, but try this sample code and see if it helps you.
import pandas as pd
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn.utils import shuffle
from io import StringIO
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
df = pd.read_csv('C:\\Users\\ryans\\OneDrive\\Desktop\\Briefcase\\PDFs\\1-ALL PYTHON & R CODE SAMPLES\\A - GITHUB\\Natural Language Processing - Amazon Reviews\\Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')
# let's experiment with some sentiment analysis concepts
# first we need to clean up the stuff in the independent field of the DF we are working with
df.replace('\'','', regex=True, inplace=True)
df['review_title'] = df[['reviews.title']].astype(str)
df['review_text'] = df[['reviews.text']].astype(str)
df['review_title'] = df['reviews.title'].str.replace('\d+', '')
df['review_text'] = df['reviews.text'].str.replace('\d+', '')
# get rid of special characters
df['review_title'] = df['reviews.title'].str.replace(r'[^\w\s]+', '')
df['review_text'] = df['reviews.text'].str.replace(r'[^\w\s]+', '')
# get rid of double spaces
df['review_title'] = df['reviews.title'].str.replace(r'\^[a-zA-Z]\s+', '')
df['review_text'] = df['reviews.text'].str.replace(r'\^[a-zA-Z]\s+', '')
# convert all case to lower
df['review_title'] = df['reviews.title'].str.lower()
df['review_text'] = df['reviews.text'].str.lower()
list_corpus = df["review_text"].tolist()
list_labels = df["reviews.rating"].tolist()
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2, random_state=40)
vectorizer = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words = 'english', binary=True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors, y_train)
pred = logreg.predict(test_vectors)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
list_corpus[3]
c = make_pipeline(vectorizer, logreg)
class_names=list(df.review_title.unique())
explainer = LimeTextExplainer(class_names=class_names)
idx = 3
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, labels=[1, 1])
print('Document id: %d' % idx)
print('Predicted class =', class_names[logreg.predict(test_vectors[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % class_names[y_test[idx]])
print ('Explanation for class %s' % class_names[1])
print ('\n'.join(map(str, exp.as_list(label=1))))
exp = explainer.explain_instance(X_test[idx], c.predict_proba, num_features=6, top_labels=2)
print(exp.available_labels())
exp.show_in_notebook(text=False)
https://towardsdatascience.com/explain-nlp-models-with-lime-shap-5c5a9f84d59b
https://marcotcr.github.io/lime/tutorials/Lime%20-%20multiclass.html
https://towardsdatascience.com/understanding-model-predictions-with-lime-a582fdff3a3b
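To apply LIME to the sentence-transformers model from the question itself, one option is to wrap the cosine similarity in a two-"class" probability function that LimeTextExplainer can call. This is only a minimal sketch; the fixed reference sentence, the rescaling of similarity from [-1, 1] to [0, 1], and the class names are assumptions made for illustration:
import numpy as np
from lime.lime_text import LimeTextExplainer
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-distilroberta-base-v1')
reference = 'The cat sits outside'        # sentence we compare against (assumed)
candidate = 'The new movie is so great'   # sentence whose words we want to explain

def similarity_proba(texts):
    # LIME expects an (n_samples, n_classes) array of probabilities;
    # map cosine similarity to a pseudo-probability of the "similar" class.
    emb_ref = model.encode([reference], convert_to_tensor=True)
    emb_txt = model.encode(list(texts), convert_to_tensor=True)
    sims = util.pytorch_cos_sim(emb_txt, emb_ref).cpu().numpy().ravel()
    sims = np.clip((sims + 1) / 2, 0, 1)  # rescale [-1, 1] -> [0, 1]
    return np.column_stack([1 - sims, sims])

explainer = LimeTextExplainer(class_names=['dissimilar', 'similar'])
exp = explainer.explain_instance(candidate, similarity_proba, num_features=6)
print(exp.as_list(label=1))               # words pushing the similarity up or down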

PyTorch - RuntimeError: Error(s) in loading state_dict for VGG:

I've trained a model using PyTorch and saved a state dict file. When I load the pre-trained model using the code below, I get the following error:
RuntimeError: Error(s) in loading state_dict for VGG:
Missing key(s) in state_dict: "features.0.weight", "features.0.bias", "features.2.weight", "features.2.bias", "features.5.weight", "features.5.bias", "features.7.weight", "features.7.bias", "features.10.weight", "features.10.bias", "features.12.weight", "features.12.bias", "features.14.weight", "features.14.bias", "features.17.weight", "features.17.bias", "features.19.weight", "features.19.bias", "features.21.weight", "features.21.bias", "features.24.weight", "features.24.bias", "features.26.weight", "features.26.bias", "features.28.weight", "features.28.bias", "classifier.0.weight", "classifier.0.bias", "classifier.3.weight", "classifier.3.bias", "classifier.6.weight", "classifier.6.bias".
Unexpected key(s) in state_dict: "state_dict", "optimizer_state_dict", "globalStep", "train_paths", "test_paths".
I am following the instructions at this site: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-across-devices
Many thanks.
import argparse
import datetime
import glob
import os
import random
import shutil
import time
from os.path import join
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.transforms import ToTensor
from tqdm import tqdm
import torch.optim as optim
from convnet3 import Convnet
from dataset2 import CellsDataset
from convnet3 import Convnet
from VGG import VGG
from dataset2 import CellsDataset
from torchvision import models
from Conv import Conv2d
parser = argparse.ArgumentParser('Predicting hits from pixels')
parser.add_argument('name',type=str,help='Name of experiment')
parser.add_argument('data_dir',type=str,help='Path to data directory containing images and gt.csv')
parser.add_argument('--weight_decay',type=float,default=0.0,help='Weight decay coefficient (something like 10^-5)')
parser.add_argument('--lr',type=float,default=0.0001,help='Learning rate')
args = parser.parse_args()
metadata = pd.read_csv(join(args.data_dir,'gt.csv'))
metadata.set_index('filename', inplace=True)
# create datasets:
dataset = CellsDataset(args.data_dir,transform=ToTensor(),return_filenames=True)
dataset = DataLoader(dataset,num_workers=4,pin_memory=True)
model_path = '/Users/nubstech/Documents/GitHub/CellCountingDirectCount/VGG_model_V1/checkpoints/checkpoint.pth'
class VGG(nn.Module):
    def __init__(self, pretrained=True):
        super(VGG, self).__init__()
        vgg = models.vgg16(pretrained=pretrained)
        # if pretrained:
        vgg.load_state_dict(torch.load(model_path))
        features = list(vgg.features.children())
        self.features4 = nn.Sequential(*features[0:23])
        self.de_pred = nn.Sequential(Conv2d(512, 128, 1, same_padding=True, NL='relu'),
                                     Conv2d(128, 1, 1, same_padding=True, NL='relu'))

    def forward(self, x):
        x = self.features4(x)
        x = self.de_pred(x)
        return x
model=VGG()
#model.load_state_dict(torch.load(model_path),strict=False)
model.eval()
#optimizer = torch.optim.Adam(model.parameters(),lr=args.lr,weight_decay=args.weight_decay)
for images, paths in tqdm(dataset):
    targets = torch.tensor([metadata['count'][os.path.split(path)[-1]] for path in paths])  # B
    targets = targets.float()
    # code to print training data to a csv file
    #filename=CellsDataset(args.data_dir,transform=ToTensor(),return_filenames=True)
    output = model(images)  # B x 1 x 9 x 9 (analogous to a heatmap)
    preds = output.sum(dim=[1, 2, 3])  # predicted cell counts (vector of length B)
    print(preds)
    paths_test = np.array([paths])
    names_preds = np.hstack(paths)
    print(names_preds)
    df = pd.DataFrame({'Image_Name': names_preds, 'Target': targets.detach(), 'Prediction': preds.detach()})
    print(df)
    # save image name, targets, and predictions
    df.to_csv(r'model.csv', index=False, mode='a')
Code for saving the state dict:
torch.save({'state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'globalStep': global_step,
            'train_paths': dataset_train.files,
            'test_paths': dataset_test.files}, checkpoint_path)
The problem is that what is being saved is not what the loading code expects. The code tries to load only a state_dict, but what was saved is quite a bit more than that: a state_dict nested inside another dict along with additional info. load_state_dict has no logic to look inside that outer dict.
This should work:
import torch, torchvision.models
model = torchvision.models.vgg16()
path = 'test.pth'
torch.save(model.state_dict(), path) # nothing else here
model.load_state_dict(torch.load(path))
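Applied to the checkpoint format shown in the question (the keys come from the torch.save call above, and model/model_path are the ones defined there), loading would look roughly like this:
import torch

# Load the full checkpoint dict, then unwrap the nested state_dict;
# the model must be an instance of the same class whose state_dict was saved.
checkpoint = torch.load(model_path, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
model.eval()
# Optionally restore the optimizer as well when resuming training:
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])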

Is this the correct way to make a dictionary using multiple images?

I have written code to make a dictionary from multiple images.
Here is my code:
from time import time
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.feature_extraction.image import extract_patches_2d
from sklearn.feature_extraction.image import reconstruct_from_patches_2d
from sklearn.utils.fixes import sp_version
from sklearn.datasets import load_sample_image
from scipy import ndimage
from skimage import color
from skimage import io
from PIL import Image
from sklearn.decomposition import SparseCoder
from sklearn.decomposition import sparse_encode
from scipy.misc import imfilter, imread
from scipy.signal import convolve2d as conv2
from skimage import data, img_as_float
from scipy import ndimage as ndi
from skimage import feature
from scipy.misc import imsave
c = np.asarray(Image.open('047.jpg').convert('L').resize((512,512), Image.ANTIALIAS))
d = np.asarray(Image.open('048.jpg').convert('L').resize((512,512), Image.ANTIALIAS))
e = np.asarray(Image.open('049.jpg').convert('L').resize((512,512), Image.ANTIALIAS))
f = np.asarray(Image.open('046.jpg').convert('L').resize((512,512), Image.ANTIALIAS))
g = np.asarray(Image.open('038.jpg').convert('L').resize((512,512), Image.ANTIALIAS))
h = np.asarray(Image.open('039.jpg').convert('L').resize((512,512), Image.ANTIALIAS))
n0 = np.asarray(Image.open('037.jpg').convert('L').resize((512,512), Image.ANTIALIAS))
n0 = n0 / 255
height, width = n0.shape
n0 = n0 + 0.075 * np.random.randn(height, width)
imsave('noise.png',n0)
patchsize = (8,8)
t0 = time()
data1 = extract_patches_2d(c,(8,8))
data2 = extract_patches_2d(d,(8,8))
data3 = extract_patches_2d(e,(8,8))
data4 = extract_patches_2d(f,(8,8))
data5 = extract_patches_2d(g,(8,8))
data6 = extract_patches_2d(h,(8,8))
data = np.append(data1,data2,axis=0)
data = np.append(data,data3,axis=0)
data = np.append(data,data4,axis=0)
data = np.append(data,data5,axis=0)
data = np.append(data,data6,axis=0)
data = data.reshape(data.shape[0], -1)
print('Extract patch shape :',data.shape)
data = data - np.mean(data, axis=0)
data = data / np.std(data, axis=0)
t1 = time()
print('Total time : ',round((t1-t0),2),' sec')
print('Learning the dictionary ....')
t2 = time()
n_iter = 1000
dico = MiniBatchDictionaryLearning(n_components=100,alpha=3,n_iter=n_iter)
V = dico.fit(data).components_
Actually, I want to train a well-learned dictionary that will help me denoise an image.
Is this the correct way to train a dictionary using multiple images?
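For reference, a minimal sketch of using the learned dictionary V to denoise the noisy image n0, continuing from the variables defined above and following the scikit-learn image-denoising example (the OMP transform settings are illustrative assumptions):
# Extract patches from the noisy image and center them, as in training.
noisy_patches = extract_patches_2d(n0, patchsize)
noisy_patches = noisy_patches.reshape(noisy_patches.shape[0], -1)
intercept = np.mean(noisy_patches, axis=0)
noisy_patches = noisy_patches - intercept

# Sparse-code the noisy patches against the learned dictionary and rebuild.
dico.set_params(transform_algorithm='omp', transform_n_nonzero_coefs=2)
code = dico.transform(noisy_patches)
patches = np.dot(code, V) + intercept
patches = patches.reshape(len(patches), *patchsize)
reconstruction = reconstruct_from_patches_2d(patches, (height, width))
imsave('denoised.png', reconstruction)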
