UnicodeDecodeError happens when loading a .p (pickle) file - parsing

I passed the path of a .p file and tried to load the file.
But this error occurred:
"UnicodeDecodeError: 'ascii' codec can't decode byte 0xf0 in position 0: ordinal not in range(128)"
def main(params):
# load the checkpoint
checkpoint_path = params['checkpoint_path']
print ('loading checkpoint %s' % (checkpoint_path, ))
#with open(checkpoint_path, 'rb') as pickle_file:
# checkpoint = pickle.load(pickle_file)
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-checkpoint_path', default="D:\\neuraltalk\\cv\\flickr8k_cnn_lstm_v1.p", type=str, help='the input checkpoint')
parser.add_argument('-r', '--root_path', default='D:\intermediate-cnn-features\images', type=str, help='folder with the images, tasks.txt file, and corresponding vgg_feats.mat file')
parser.add_argument('-b', '--beam_size', type=int, default=1, help='beam size in inference. 1 indicates greedy per-word max procedure. Good value is approx 20 or so, and more = better.')
args = parser.parse_args()
params = vars(args) # convert to ordinary dict
print ('parsed parameters:')
print (json.dumps(params, indent = 2))
main(params)
I tried to fix it like this:
with open(checkpoint_path, 'rb', encoding='utf-8') as pickle_file:
checkpoint = pickle.load(pickle_file)
But then another error occurred:
"ValueError: binary mode doesn't take an encoding argument"
what should i do?

pickle is a binary protocol, so use binary mode. As your error says, "binary mode doesn't take an encoding argument":
with open(checkpoint_path, 'rb') as pickle_file:
checkpoint = pickle.load(pickle_file)
Your other error "UnicodeDecodeError", would only happen in text mode. Make sure to use 'rb'. If you still have issues, update your question with a reproducible example, along with sample input and a full traceback of the error message.
Full demo:
import pickle
data = {'key':[1,2,3], 'key2':[4,5,6]}
with open('out.p','wb') as f:
pickle.dump(data,f)
with open('out.p','rb') as f:
data2 = pickle.load(f)
print(data == data2)
True

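I found the answer on my own.
The checkpoint was presumably pickled under Python 2, and Python 3's pickle.load decodes such 8-bit strings as ASCII by default, which is where the 'ascii' codec error comes from. You can fix this problem by passing encoding='latin1' to pickle.load: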
with open(checkpoint_path, 'rb') as pickle_file:
checkpoint = pickle.load(pickle_file, encoding='latin1')

Related

How to save sentence-Bert output vectors to a file?

I am using BERT to get the similarity between multi-term words. Here is the code that I used for embedding:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-large-uncased-whole-word-masking')
words = [
"Artificial intelligence",
"Data mining",
"Political history",
"Literature book"]
I also have a dataset which contains 540000 other words.
Vocabs = [
"Winter flooding",
"Cholesterol diet", ....]
The problem is that when I want to embed Vocabs into vectors, it takes forever.
words_embeddings = model.encode(words)
Vocabs_embeddings = model.encode(Vocabs)
Is there any way to make it faster? Alternatively, I would like to embed Vocabs in a for loop and save the output vectors to a file, so that I don't have to embed 540000 vocabs every time I need them. Is there a way to save the embeddings to a file and use them again?
I would really appreciate your help.
You can pickle your corpus and embeddings like this, you can also pickle a dictionary instead, or write them to file in any other format you prefer.
import pickle
with open("my-embeddings.pkl", "wb") as fOut:
pickle.dump({'sentences': words, 'embeddings': word_embeddings},fOut)
Or more generally like below, so you encode when the embeddings don't exist but after that any time you need them you load from file, instead of re-encoding your corpus:
if not os.path.exists(embedding_cache_path):
# read your corpus etc
corpus_sentences = ...
print("Encoding the corpus. This might take a while")
corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
print("Storing file on disc")
with open(embedding_cache_path, "wb") as fOut:
pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut)
else:
print("Loading pre-computed embeddings from disc")
with open(embedding_cache_path, "rb") as fIn:
cache_data = pickle.load(fIn)
corpus_sentences = cache_data['sentences']
corpus_embeddings = cache_data['embeddings']

Does anyone know what the unknown error "type b" in skimage tifffile save is?

I am getting a strange error saving a tiff file (stack grayscale), any idea?:
File
"C:\Users\ptyimg_np.MT00200169\Anaconda3\lib\site-packages\tifffile\tifffile.py",
line 1241, in save
sampleformat = {'u': 1, 'i': 2, 'f': 3, 'c': 6}[datadtype.kind] KeyError: 'b'
my code is
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from skimage.morphology import watershed
from skimage.feature import peak_local_max
from scipy import ndimage
from skimage import img_as_float
from skimage import exposure,io
from skimage import external
from skimage.color import rgb2gray
from skimage.filters import threshold_local , threshold_niblack
import numpy as np
import tifffile
from joblib import Parallel, delayed
import sys
# Load an example image
input_namefile = sys.argv[1]
output_namefile = 'seg_'+ input_namefile
#Settings
block_size = 25 #Size block of the local thresholding
img = io.imread(input_namefile, plugin='tifffile')
thresh = threshold_niblack(img, window_size=block_size , k=0.8) #
res = img > thresh
res = np.asanyarray(res)
print("saving segmentation")
tifffile.imsave(output_namefile, res , photometric='minisblack' )
It looks like the error is caused by a bug in writing boolean images in your installed version of tifffile. However, the bug has been fixed in more recent versions (I have 2020.2.16 in my current environment). On my machine, this works fine:
import numpy as np
import tifffile
tifffile.imsave('test.tiff', np.random.random((10, 10)) > 0.5)
and the line causing a crash in your version is never executed in the case of a boolean image.
So, long story short, use python -m pip install -U tifffile to upgrade your version of tifffile, and your program should work!
Some analysis first. The offending line:
sampleformat = {'u': 1, 'i': 2, 'f': 3, 'c': 6}[datadtype.kind]
is causing a KeyError exception because the value of datadtype.kind (the NumPy datatype) is set to b and there is no b in that dictionary. It only caters for types i, u, f, and c (respectively, signed integer, unsigned integer, floating-point, and complex floating-point). Type b is boolean.
This looks like a bug in the code that you're using. If it's something that's not supported, the code should really catch the exception and report on it in a more user-friendly manner rather than just dumping an exception for you to figure out.
My advice is to raise this as a bug with the author.
In terms of the root cause of the issue (this is speculation based on analysis, so could be wrong, I'm just providing it as a possible cause), an examination of your code shows:
img = io.imread(input_namefile, plugin='tifffile')
thresh = threshold_niblack(img, window_size=block_size , k=0.8) #
res = img > thresh
res = np.asanyarray(res)
tifffile.imsave(output_namefile, res , photometric='minisblack' )
That third line above will set res to either a boolean value or a boolean array that depends on the respective values of each pixel in img and thresh (I don't know enough about NumPy to pontificate on this).
However, regardless of that, they are one or more booleans so, when you try to write them with the imsave() call, it complains about the type being used (as mentioned above, it appears to not cater for boolean values correctly).
Based on some sample code found elsewhere:
image = data.coins()
mask = image > 128
masked_image = image * mask
I suspect that you should use something similar to that last line to apply the mask to the image, then write the resultant value:
img = io.imread(input_namefile, plugin='tifffile')
thresh = threshold_niblack(img, window_size=block_size , k=0.8)
mask = image > 128 # <-- unsure if this is needed.
res = img * thresh # <-- add this line.
res = np.asanyarray(res)
tifffile.imsave(output_namefile, res , photometric='minisblack' )
Applying the mask to the original image should give you an array of usable values that you can write back out to an image file. Note that I'm unsure whether you need the res > thresh line since it appears to me that the threshold already gives you a mask. I could be wrong on that point so my advice is still to raise it with the author.

Getting an error while converting Tibble to h2o hex file

I am running the h2o package in RStudio, and I am getting an error while converting a tibble into an h2o frame.
Below is my code
#Augment Time Series Signature
PO_Data_aug = PO_Data %>%
tk_augment_timeseries_signature()
PO_Data_aug
# Split into training, validation and test sets
train_tbl = PO_Data_aug %>% filter(Date <= '2017-12-29')
valid_tbl = PO_Data_aug %>% filter(Date>'2017-12-29'& Date <='2018-03-31')
test_tbl = PO_Data_aug %>% filter(Date > '2018-03-31')
str(train_tbl)
train_tbl$month.lbl<-as.character(train_tbl$month.lbl)
h2o.init() # Fire up h2o
##hex
train_h2o = as.h2o(train_tbl)
valid_h2o = as.h2o(valid_tbl)
test_h2o = as.h2o(test_tbl)
ERROR: Unexpected HTTP Status code: 412 Precondition Failed (url = http://localhost:54321/3/Parse)
ERROR MESSAGE:
Provided column type ordered is unknown. Cannot proceed with parse due to invalid argument.
Kindly Suggest
This is actually a bug in H2O -- it has nothing to do with tibbles. There is no support for the "ordered" column type in data.frames or tibbles. We will fix this (ticket here).
The work-around right now is to manually convert your "ordered" columns into un-ordered "factor" columns.
tb <- tibble(x = ordered(c(1,2,3)), y = 1:3)
tb$x <- factor(tb$x, ordered = FALSE)
hf <- as.h2o(tb)
as.h2o() expects an R dataframe. You could use an R dataframe instead of your tibble dataframe or as Tom mentioned in the comments you could use one of the supported file formats for H2O.
train_h2o = as.h2o(as_data_frame(train_tbl))
valid_h2o = as.h2o(as_data_frame(valid_tbl))
test_h2o = as.h2o(as_data_frame(test_tbl))

Generating words from trained RNN model: "Variable already exists, disallowed. Did you mean to set reuse=True in VarScope? "

So I implemented an RNN word generator model in a Jupyter notebook.
When I was trying to use the trained model to generate some words:
with open(os.path.join(cfgs['save_dir'], 'config.pkl'), 'rb') as f:
saved_args = cPickle.load(f)
with open(os.path.join(cfgs['save_dir'], 'words_vocab.pkl'), 'rb') as f:
words, vocab = cPickle.load(f)
with tf.Session() as sess:
model = Model(saved_args, True)
tf.global_variables_initializer().run()
saver = tf.train.Saver(tf.global_variables())
ckpt = tf.train.get_checkpoint_state(cfgs['save_dir'])
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
print(model.sample(sess, words, vocab, cfgs['n'], cfgs['prime'], cfgs['sample'], cfgs['pick'], cfgs['width']))
It works for the first time, but if I run the code again there is an error:
ValueError: Variable rnnlm/softmax_w already exists, disallowed. Did you mean to set reuse=True in VarScope?
Right now I have to shut down the ipynb file then run the code to get a new sample.
How to change the code to avoid this situation?
You can call the model.sample function multiple times without a problem but everything else (creating the session, constructing the Model, loading the checkpoint) should only be run once. If you refactor your code then you won't see that error message anymore.

Labeling Images using Inception Getting ValueError: GraphDef cannot be larger than 2GB

I am using the TensorFlow for Poets code lab to guide me as I retrain the Inceptionv3 CNN to classify a list of images. I have successfully trained the model, and it works when I employ the given code to classify individual images. But when I try to use it on a large batch of images, I get the error "GraphDef cannot be larger than 2GB". Please advise.
import pandas as pd
import os, sys
import tensorflow as tf
test_images = pd.read_csv('test_images.csv')
testid = test_images['Id']
listx= list(range(4320))
predlist=[]
output = pd.DataFrame({'Id': listx})
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
for x in listx:
path = 'test/'+str(x+1)+'.jpg'
# change this as you see fit
image_path = path
# Read in the image_data
image_data = tf.gfile.FastGFile(image_path, 'rb').read()
# Loads label file, strips off carriage return
label_lines = [line.rstrip() for line
in tf.gfile.GFile("retrained_labels.txt")]
# Unpersists graph from file
with tf.gfile.FastGFile("retrained_graph.pb", 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
tf.import_graph_def(graph_def, name='')
with tf.Session() as sess:
# Feed the image_data as input to the graph and get first prediction
with tf.Graph().as_default():
softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
predictions = sess.run(softmax_tensor, \
{'DecodeJpeg/contents:0': image_data})
# Sort to show labels of first prediction in order of confidence
top_k = predictions[0].argsort()[-len(predictions[0]):][::-1]
# print('the top result is' + label_lines[node_id])
flag = 0
for node_id in top_k:
while flag == 0:
human_string = label_lines[node_id]
score = predictions[0][node_id]
predlist.append(int(human_string[:3]))
print('%s' % (human_string))
flag = 1 # we only want the top prediction
output['Prediction']=predlist
output.to_csv('outputtest.csv')
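One way by which this error can be solved is by placing
with tf.Graph().as_default():
immediately after the for statement, i.e. as the first line inside the loop body, so that each iteration builds its graph in a fresh tf.Graph instead of adding to the same default graph.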
This is the piece of code that worked for me while trying to read bulk image:
for filename in os.listdir(image_path):
with tf.Graph().as_default():
# Read in the image_data
image_data = tf.gfile.FastGFile(image_path + filename, 'rb').read()

Resources