Deploy pytorch model on webcam - image-processing

I am trying to deploy PyTorch classifier on webcam, but always getting errors, mostly "AttributeError: 'collections.OrderedDict' object has no attribute 'load_state_dict'". The classifier is a binary classifier. Saved the model as .pt file.
Hope for your support to resolve the issue.
Here are the codes I am using:
import numpy as np
import torch
import torch.nn
import torchvision
from torch.autograd import Variable
from torchvision import transforms
import PIL
import cv2
#This is the Label
Labels = { 0 : 'Perfect',
1 : 'Defected'
}
# Let's preprocess the inputted frame
data_transforms = torchvision.transforms.Compose([
torchvision.transforms.Resize(size=(224, 224)),
torchvision.transforms.RandomHorizontalFlip(),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") ##Assigning the Device which will do the calculation
model = torch.load("defect_classifier.pt") #Load model to CPU
model.load_state_dict(torch.load("defect_classifier.pt"))
model = model.to(device) #set where to run the model and matrix calculation
model.eval() #set the device to eval() mode for testing
#Set the Webcam
def Webcam_720p():
cap.set(3,1280)
cap.set(4,720)
def argmax(prediction):
prediction = prediction.cpu()
prediction = prediction.detach().numpy()
top_1 = np.argmax(prediction, axis=1)
score = np.amax(prediction)
score = '{:6f}'.format(score)
prediction = top_1[0]
result = Labels[prediction]
return result,score
def preprocess(image):
image = PIL.Image.fromarray(image) #Webcam frames are numpy array format
#Therefore transform back to PIL image
print(image)
image = data_transforms(image)
image = image.float()
#image = Variable(image, requires_autograd=True)
image = image.cuda()
image = image.unsqueeze(0) #I don't know for sure but Resnet-50 model seems to only
#accpets 4-D Vector Tensor so we need to squeeze another
return image #dimension out of our 3-D vector Tensor
#Let's start the real-time classification process!
cap = cv2.VideoCapture(0) #Set the webcam
Webcam_720p()
fps = 0
show_score = 0
show_res = 'Nothing'
sequence = 0
while True:
ret, frame = cap.read() #Capture each frame
if fps == 4:
image = frame[100:450,150:570]
image_data = preprocess(image)
print(image_data)
prediction = model(image_data)
result,score = argmax(prediction)
fps = 0
if result >= 0.5:
show_res = result
show_score= score
else:
show_res = "Nothing"
show_score = score
fps += 1
cv2.putText(frame, '%s' %(show_res),(950,250), cv2.FONT_HERSHEY_SIMPLEX, 2, (255,255,255), 3)
cv2.putText(frame, '(score = %.5f)' %(show_score), (950,300), cv2.FONT_HERSHEY_SIMPLEX, 1,(255,255,255),2)
cv2.rectangle(frame,(400,150),(900,550), (250,0,0), 2)
cv2.imshow("ASL SIGN DETECTER", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cap.release()
cv2.destroyWindow("ASL SIGN DETECTER")

Dont use this line
model = torch.load("defect_classifier.pt")
instead use model = Your_model_class() , since your model is object of your model class

Related

Why does Mean-Shift Clustering return 1 for anomaly detection in images? Which paramterers are important for this algorithm?

I want to implement an anomaly detection algorithm for images and used K-Means Clustering there the number of clusters are not known. I tried to use Elbow Methods but its very difficult for me to anaylze and I also used Mean Shift Algorithm but it returns 1 for the number of clusters. What kind of algorithm should I use then? or is it good idea to use mean shift algorithm for 5000 images?
Here is the code:
import random, cv2, os, sys, shutil
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import keras
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs
class img_clustering:
def __init__(self, folder_path="data", max_examples=None):
paths = os.listdir(folder_path)
if max_examples == None:
self.max_examples = len(paths)
else:
if max_examples > len(paths):
self.max_examples = len(paths)
else:
self.max_examples = max_examples
self.n_clusters = n_clusters
self.folder_path = folder_path
random.shuffle(paths)
self.image_paths = paths[:self.max_examples]
self.use_imagenets = use_imagenets
def load_images(self):
self.images = []
for image in self.image_paths:
img = cv2.imread(self.folder_path + "/" + image)
img = cv2.resize(img,(224,224))
self.images.append(img)
self.images = np.float32(self.images)#
self.images /= 255
def get_new_imagevectors(self):
model1 = keras.applications.MobileNetV2(input_shape=(224,224,3), alpha=1.0, include_top=False, weights='imagenet', pooling=None)
model1.summary()
pred = model1.predict(self.images)
images_temp = pred.reshape(self.images.shape[0], -1)
model2 = PCA(n_components=None, random_state=728)
model2.fit(images_temp)
self.images_new = model2
def clustering(self):
X, _ = make_blobs(n_samples=10000, centers=self.images_new, cluster_std=0.6)
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
model = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True,
n_jobs=None, seeds=None)
model.fit(X)
labels = model.labels_
cluster_centers = model.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
predictions = model.predict(self.images_new)
print("\n")
print("The number of clusters:",n_clusters_)
print("The number of labels_unique:",labels_unique)
print("\n")
if __name__=='__main__':
data_path = "Path" # path of the folder that contains the images
max_examples = 5000
strt = img_clustering(data_path, max_examples)
strt.load_images()
strt.get_new_imagevectors()
strt.clustering()
The elbow method showing optimal k but I don't know what the optimal k numbers can be in this plot.
enter image description here

keras neural network predicts the same number for every handwritten digit

I am new to machine learning so as a first project I've tried to built a handwritten digit recognition neural network based on the mnist dataset and when I test it with the test images provided by the data set itself it seems to work pretty well (that's what the function test_predict is for). Now I would like to step it up and have the network recognise some actual handwritten digits that I've taken photos of.
The function partial_img_rec takes on an image containing multiple digits and it will be called by multiple_digits. I know it might seem weird that I use recursion here and I'm sure there are some more efficient ways to do this but that's not the matter. In order to test partial_img_rec I provide some photos of individual digits that are stored in the folder .\individual_test and they all look something like this:
The problem is: My neural network's prediction for every single one of my test images is "5". The probability is always about 22% no matter the actual digit displayed. I totally get why the results are not as great as those achieved with the mnist dataset's test images but I certainly didn't expect this. Do you have any idea why this is happening? Any advise is welcome.
Thank you in advance.
Here's my code (edited, now working):
# import keras and the MNIST dataset
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.utils import np_utils
# numpy is necessary since keras uses numpy arrays
import numpy as np
# imports for pictures
from PIL import Image
from PIL import ImageOps
# imports for tests
import random
import os
class mnist_network():
def __init__(self):
""" load data, create and train model """
# load data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# flatten 28*28 images to a 784 vector for each image
num_pixels = X_train.shape[1] * X_train.shape[2]
X_train = X_train.reshape((X_train.shape[0], num_pixels)).astype('float32')
X_test = X_test.reshape((X_test.shape[0], num_pixels)).astype('float32')
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255
# one hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
num_classes = y_test.shape[1]
# create model
self.model = Sequential()
self.model.add(Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
# Compile model
self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# train the model
self.model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=200, verbose=2)
self.train_img = X_train
self.train_res = y_train
self.test_img = X_test
self.test_res = y_test
def test_all(self):
""" evaluates the success rate using all the test data """
scores = self.model.evaluate(self.test_img, self.test_res, verbose=0)
print("Baseline Error: %.2f%%" % (100-scores[1]*100))
def predict_result(self, img, num_pixels = None, show=False):
""" predicts the number in a picture (vector) """
assert type(img) == np.ndarray and img.shape == (784,)
"""if show:
# show the picture!!!! some problem here
plt.imshow(img, cmap='Greys')
plt.show()"""
num_pixels = img.shape[0]
# the actual number
res_number = np.argmax(self.model.predict(img.reshape(-1,num_pixels)), axis = 1)
# the probabilities
res_probabilities = self.model.predict(img.reshape(-1,num_pixels))
return (res_number[0], res_probabilities.tolist()[0]) # we only need the first element since they only have one
def test_predict(self, amount_test = 100):
""" test some random numbers from the test part of the data set """
assert type(amount_test) == int and amount_test <= 10000
cnt_right = 0
cnt_wrong = 0
for i in range(amount_test):
ind = random.randrange(0,10000) # there are 10000 images in the test part of the data set
""" correct_res is the actual result stored in the data set
It's represented as a list of 10 elements one of which being 1, the rest 0 """
correct_list = self.test_res.tolist()
correct_list = correct_list[ind] # the correct sublist
correct_res = correct_list.index(1.0)
predicted_res = self.predict_result(self.test_img[ind])[0]
if correct_res != predicted_res:
cnt_wrong += 1
print("Error in predict ! \
index = ", ind, " predicted result = ", predicted_res, " correct result = ", correct_res)
else:
cnt_right += 1
print("The machine predicted correctly ",cnt_right," out of ",amount_test," examples. That is a success rate of ", (cnt_right/amount_test)*100,"%.")
def partial_img_rec(self, image, upper_left, lower_right, results=[]):
""" partial is a part of an image """
left_x, left_y = upper_left
right_x, right_y = lower_right
print("current test part: ", upper_left, lower_right)
print("results: ", results)
# condition to stop recursion: we've reached the full width of the picture
width, height = image.size
if right_x > width:
return results
partial = image.crop((left_x, left_y, right_x, right_y))
# rescale image to 28 *28 dimension
partial = partial.resize((28,28), Image.ANTIALIAS)
partial.show()
# transform to vector
partial = ImageOps.invert(partial)
partial = np.asarray(partial, "float32")
partial = partial / 255.
partial[partial < 0.5] = 0.
# flatten image to 28*28 = 784 vector
num_pixels = partial.shape[0] * partial.shape[1]
partial = partial.reshape(num_pixels)
step = height // 10
# is there a number in this part of the image?
res, prop = self.predict_result(partial)
print("result: ", res, ". probabilities: ", prop)
# only count this result if the network is >= 50% sure
if prop[res] >= 0.5:
results.append(res)
# step is 80% of the partial image's size (which is equivalent to the original image's height)
step = int(height * 0.8)
print("found valid result")
else:
# if there is no number found we take smaller steps
step = height // 20
print("step: ", step)
# recursive call with modified positions ( move on step variables )
return self.partial_img_rec(image, (left_x+step, left_y), (right_x+step, right_y), results=results)
def test_individual_digits(self):
""" test partial_img_rec with some individual digits (square shaped images)
saved in the folder 'individual_test' following the pattern 'number_digit.jpg' """
cnt_right, cnt_wrong = 0,0
folder_content = os.listdir(".\individual_test")
for imageName in folder_content:
# image file must be a jpg or png
assert imageName[-4:] == ".jpg" or imageName[-4:] == ".png"
correct_res = int(imageName[0])
image = Image.open(".\\individual_test\\" + imageName).convert("L")
# only square images in this test
if image.size[0] != image.size[1]:
print(imageName, " has the wrong proportions: ", image.size,". It has to be a square.")
continue
predicted_res = self.partial_img_rec(image, (0,0), (image.size[0], image.size[1]), results=[])
if predicted_res == []:
print("No prediction possible for ", imageName)
else:
predicted_res = predicted_res[0]
if predicted_res != correct_res:
print("error in partial_img-rec! Predicted ", predicted_res, ". The correct result would have been ", correct_res)
cnt_wrong += 1
else:
cnt_right += 1
print("correctly predicted ",imageName)
print(cnt_right, " out of ", cnt_right + cnt_wrong," digits were correctly recognised. The success rate is therefore ", (cnt_right / (cnt_right + cnt_wrong)) * 100," %.")
def multiple_digits(self, img):
""" takes as input an image without unnecessary whitespace surrounding the digits """
#assert type(img) == myImage
width, height = img.size
# start with the first quadratic part of the image
res_list = self.partial_img_rec(img, (0,0),(height ,height))
res_str =""
for elem in res_list:
res_str += str(elem)
return res_str
network = mnist_network()
network.test_individual_digits()
EDIT
#Geecode's answer was very helpful and the network now predicts correctly some of the pictures including the one shown above. Yet the overall success rate is lower than 50%. Do you have any ideas how to improve this?
Examples for images returning bad results:
Nothing wrong with your image in itself, your model can correctly classify it.
The issue is that you made a Floor Division on your partial:
partial = partial // 255
which always results in 0. So you always get a black image.
You have to do a "normal" division and some preparation, because your model was trained on black i.e. 0. valued pixel backgrounded negative images:
# transform to vector
partial = ImageOps.invert(partial)
partial = np.asarray(partial, "float32")
partial = partial / 255.
partial[partial < 0.5] = 0.
After then your model will classify correctly:
Out:
result: 1 . probabilities: [0.000431705528171733, 0.7594985961914062, 0.0011404436081647873, 0.00018972357793245465, 0.03162384033203125, 0.008697531186044216, 0.0014472954208031297, 0.18429973721504211, 0.006838776171207428, 0.005832481198012829]
found valid result
Note, that of course you can play on the image preparation yet, that was not the purpose of this answer.
Update:
My detailed answer regarding how to achive better performance in this task, see here.

How to count vehicles using opencv in python?

I am working on a VCS (vehicle counting system) project. The scope of the project is to classify and count vehicles. I have built a custom model using Faster-RCNN in Tensorflow-object-detection-API This model only contains 7 classes such as car motorbike, bicycle and etc. The model works perfectly, But, the problem is "COUNTING". It is very hard to count vehicles in video frame. I did a pre-research on the internet. I tried a lot. but i could not find any useful information. There are some projects on github, they use tracking methods.
I want the following things. I want to draw an horizontal line in the frame. when the vehicle touch it, the counting should take place. How to do it. I don't know the algorithm behind it. I heard that centroid tracking would help me.
My question is, i want to count vehicles when it touch the horizontal line. I have linked a sample image bellow.
Sample_Image
import os
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
# Import utilites
from utils import label_map_util
from utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = 'inference_graph'
VIDEO_NAME = 'Video_105.mp4'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH,'training','labelmap.pbtxt')
# Path to video
PATH_TO_VIDEO = os.path.join(CWD_PATH,VIDEO_NAME)
# Number of classes the object detector can identify
NUM_CLASSES = 7
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Open video file
video = cv2.VideoCapture(PATH_TO_VIDEO)
while(video.isOpened()):
# Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
# i.e. a single-column array, where each item in the column has the pixel RGB value
ret, frame = video.read()
frame_expanded = np.expand_dims(frame, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) = sess.run(
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: frame_expanded})
# Draw the results of the detection (aka 'visulaize the results')
vis_util.visualize_boxes_and_labels_on_image_array(
frame,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=8,
min_score_thresh=0.90)
# All the results have been drawn on the frame, so it's time to display it.
final_score = np.squeeze(scores)
count = 0
cv2.line(frame, (1144, 568), (1723,664), (0,0,255), 2) #Line
for i in range(100):
if scores is None or final_score[i] > 0.90:
min_score_thresh = 0.90
bboxes = boxes[scores > min_score_thresh]
im_width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
im_height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
final_box = []
for box in bboxes:
ymin, xmin, ymax, xmax = box
print("Ymin:{}:Xmin:{}:Ymax:{}Xmax{}".format(ymin*im_width,xmin*im_width,ymax*im_width,xmax*im_width))
final_box.append([xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height])
#print(final_box)
cv2.imshow('Object detector', frame)
# Press 'q' to quit
if cv2.waitKey(1) == ord('q'):
break
# Clean up
video.release()
cv2.destroyAllWindows()
# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
import argparse
import imutils
import time
import cv2
tracker = cv2.TrackerCSRT_create()
vs = cv2.VideoCapture("Video.mp4")
initBB = None
detec = []
def pega_centro(x, y, w, h):
x1 = int(w / 2)
y1 = int(h / 2)
cx = x + x1
cy = y + y1
return cx,cy
roi = 480
counter = 0
offset = 6
# loop over frames from the video stream
while vs.isOpened():
ret,frame = vs.read()
cv2.line(frame, (769 , roi), (1298 , roi), (255,0,0), 3)
# check to see if we are currently tracking an object
if initBB is not None:
# grab the new bounding box coordinates of the object
(success, box) = tracker.update(frame)
# check to see if the tracking was a success
if success:
(x, y, w, h) = [int(v) for v in box]
cv2.rectangle(frame, (x, y), (x + w, y + h),
(0, 255, 0), 2)
cX = int((x + x+w) / 2.0)
cY = int((y + y+h) / 2.0)
cv2.circle(frame, (cX, cY), 3, (0, 0, 255), -1)
c=pega_centro(x, y, w, h)
detec.append(c)
for (x,y) in detec:
if y<(roi+offset) and y>(roi-offset):
counter+=1
print(counter)
cv2.line(frame, (769 , roi), (1298 , roi), (0,0,255), 3)
detec.remove((x,y))
# show the output frame
cv2.imshow("Frame", frame)
key = cv2.waitKey(1) & 0xFF
if key == ord("s"):
# select the bounding box of the object we want to track (make
# sure you press ENTER or SPACE after selecting the ROI)
initBB = cv2.selectROI("Frame", frame, fromCenter=False,
showCrosshair=True)
# start OpenCV object tracker using the supplied bounding box
# coordinates, then start the FPS throughput estimator as well
tracker.init(frame, initBB)
fps = FPS().start()
# if the `q` key was pressed, break from the loop
elif key == ord("q"):
break
else:
vs.release()
cv2.destroyAllWindows()

Facing issue while converting YOLOV3 Pytorch model to coreml to be in IOS App using ONNX

Model I want to convert: https://github.com/ultralytics/yolov3
I am trying to convert this pytorch yolov3 model to coreML and for that I have used ONNX which is used to convert model from one platform to another.
It is converting the model and when I run it in xcode I can see that it its inputs and outputs are different and it is not detecting any object and not showing any rectangle on the screen.
I have already tried this tutorial and followed the similar steps but this time for YOLOV3.
google doc: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI
I also checked that Yolov3 providing the "onnx" file themselves as well, So I even tried that file to use it and convert it to coreML but still it is not detecting the objects and giving the wrong input/output values.
Convert Pytroch to ONNX
File named "yoloToOnnx.py" content:
import argparse
import os
import sys
import time
import re
import numpy as np
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
import torch.onnx
import utils
from transformer_net import TransformerNet
from vgg import Vgg16
def check_paths(args):
try:
if not os.path.exists(args.save_model_dir):
os.makedirs(args.save_model_dir)
if args.checkpoint_model_dir is not None and not (os.path.exists(args.checkpoint_model_dir)):
os.makedirs(args.checkpoint_model_dir)
except OSError as e:
print(e)
sys.exit(1)
def train(args):
device = torch.device("cuda" if args.cuda else "cpu")
np.random.seed(args.seed)
torch.manual_seed(args.seed)
transform = transforms.Compose([
transforms.Resize(args.image_size),
transforms.CenterCrop(args.image_size),
transforms.ToTensor(),
transforms.Lambda(lambda x: x.mul(255))
])
train_dataset = datasets.ImageFolder(args.dataset, transform)
train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
transformer = TransformerNet().to(device)
optimizer = Adam(transformer.parameters(), args.lr)
mse_loss = torch.nn.MSELoss()
vgg = Vgg16(requires_grad=False).to(device)
style_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Lambda(lambda x: x.mul(255))
])
style = utils.load_image(args.style_image, size=args.style_size)
style = style_transform(style)
style = style.repeat(args.batch_size, 1, 1, 1).to(device)
features_style = vgg(utils.normalize_batch(style))
gram_style = [utils.gram_matrix(y) for y in features_style]
for e in range(args.epochs):
transformer.train()
agg_content_loss = 0.
agg_style_loss = 0.
count = 0
for batch_id, (x, _) in enumerate(train_loader):
n_batch = len(x)
count += n_batch
optimizer.zero_grad()
x = x.to(device)
y = transformer(x)
y = utils.normalize_batch(y)
x = utils.normalize_batch(x)
features_y = vgg(y)
features_x = vgg(x)
content_loss = args.content_weight * mse_loss(features_y.relu2_2, features_x.relu2_2)
style_loss = 0.
for ft_y, gm_s in zip(features_y, gram_style):
gm_y = utils.gram_matrix(ft_y)
style_loss += mse_loss(gm_y, gm_s[:n_batch, :, :])
style_loss *= args.style_weight
total_loss = content_loss + style_loss
total_loss.backward()
optimizer.step()
agg_content_loss += content_loss.item()
agg_style_loss += style_loss.item()
if (batch_id + 1) % args.log_interval == 0:
mesg = "{}\tEpoch {}:\t[{}/{}]\tcontent: {:.6f}\tstyle: {:.6f}\ttotal: {:.6f}".format(
time.ctime(), e + 1, count, len(train_dataset),
agg_content_loss / (batch_id + 1),
agg_style_loss / (batch_id + 1),
(agg_content_loss + agg_style_loss) / (batch_id + 1)
)
print(mesg)
if args.checkpoint_model_dir is not None and (batch_id + 1) % args.checkpoint_interval == 0:
transformer.eval().cpu()
ckpt_model_filename = "ckpt_epoch_" + str(e) + "_batch_id_" + str(batch_id + 1) + ".pth"
ckpt_model_path = os.path.join(args.checkpoint_model_dir, ckpt_model_filename)
torch.save(transformer.state_dict(), ckpt_model_path)
transformer.to(device).train()
# save model
transformer.eval().cpu()
save_model_filename = "epoch_" + str(args.epochs) + "_" + str(time.ctime()).replace(' ', '_') + "_" + str(
args.content_weight) + "_" + str(args.style_weight) + ".model"
save_model_path = os.path.join(args.save_model_dir, save_model_filename)
torch.save(transformer.state_dict(), save_model_path)
print("\nDone, trained model saved at", save_model_path)
def stylize(args):
device = torch.device("cuda" if args.cuda else "cpu")
content_image = utils.load_image(args.content_image, scale=args.content_scale)
content_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Lambda(lambda x: x.mul(255))
])
content_image = content_transform(content_image)
content_image = content_image.unsqueeze(0).to(device)
if args.model.endswith(".onnx"):
output = stylize_onnx_caffe2(content_image, args)
else:
with torch.no_grad():
style_model = TransformerNet()
state_dict = torch.load(args.model)
# remove saved deprecated running_* keys in InstanceNorm from the checkpoint
for k in list(state_dict.keys()):
if re.search(r'in\d+\.running_(mean|var)$', k):
del state_dict[k]
style_model.load_state_dict(state_dict)
style_model.to(device)
if args.export_onnx:
assert args.export_onnx.endswith(".onnx"), "Export model file should end with .onnx"
output = torch.onnx._export(style_model, content_image, args.export_onnx).cpu()
else:
output = style_model(content_image).cpu()
utils.save_image(args.output_image, output[0])
def stylize_onnx_caffe2(content_image, args):
"""
Read ONNX model and run it using Caffe2
"""
assert not args.export_onnx
import onnx
import onnx_caffe2.backend
model = onnx.load(args.model)
prepared_backend = onnx_caffe2.backend.prepare(model, device='CUDA' if args.cuda else 'CPU')
inp = {model.graph.input[0].name: content_image.numpy()}
c2_out = prepared_backend.run(inp)[0]
return torch.from_numpy(c2_out)
def main():
main_arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
subparsers = main_arg_parser.add_subparsers(title="subcommands", dest="subcommand")
train_arg_parser = subparsers.add_parser("train", help="parser for training arguments")
train_arg_parser.add_argument("--epochs", type=int, default=2,
help="number of training epochs, default is 2")
train_arg_parser.add_argument("--batch-size", type=int, default=4,
help="batch size for training, default is 4")
train_arg_parser.add_argument("--dataset", type=str, required=True,
help="path to training dataset, the path should point to a folder "
"containing another folder with all the training images")
train_arg_parser.add_argument("--style-image", type=str, default="images/style-images/mosaic.jpg",
help="path to style-image")
train_arg_parser.add_argument("--save-model-dir", type=str, required=True,
help="path to folder where trained model will be saved.")
train_arg_parser.add_argument("--checkpoint-model-dir", type=str, default=None,
help="path to folder where checkpoints of trained models will be saved")
train_arg_parser.add_argument("--image-size", type=int, default=256,
help="size of training images, default is 256 X 256")
train_arg_parser.add_argument("--style-size", type=int, default=None,
help="size of style-image, default is the original size of style image")
train_arg_parser.add_argument("--cuda", type=int, required=True,
help="set it to 1 for running on GPU, 0 for CPU")
train_arg_parser.add_argument("--seed", type=int, default=42,
help="random seed for training")
train_arg_parser.add_argument("--content-weight", type=float, default=1e5,
help="weight for content-loss, default is 1e5")
train_arg_parser.add_argument("--style-weight", type=float, default=1e10,
help="weight for style-loss, default is 1e10")
train_arg_parser.add_argument("--lr", type=float, default=1e-3,
help="learning rate, default is 1e-3")
train_arg_parser.add_argument("--log-interval", type=int, default=500,
help="number of images after which the training loss is logged, default is 500")
train_arg_parser.add_argument("--checkpoint-interval", type=int, default=2000,
help="number of batches after which a checkpoint of the trained model will be created")
eval_arg_parser = subparsers.add_parser("eval", help="parser for evaluation/stylizing arguments")
eval_arg_parser.add_argument("--content-image", type=str, required=True,
help="path to content image you want to stylize")
eval_arg_parser.add_argument("--content-scale", type=float, default=None,
help="factor for scaling down the content image")
eval_arg_parser.add_argument("--output-image", type=str, required=True,
help="path for saving the output image")
eval_arg_parser.add_argument("--model", type=str, required=True,
help="saved model to be used for stylizing the image. If file ends in .pth - PyTorch path is used, if in .onnx - Caffe2 path")
eval_arg_parser.add_argument("--cuda", type=int, required=True,
help="set it to 1 for running on GPU, 0 for CPU")
eval_arg_parser.add_argument("--export_onnx", type=str,
help="export ONNX model to a given file")
args = main_arg_parser.parse_args()
if args.subcommand is None:
print("ERROR: specify either train or eval")
sys.exit(1)
if args.cuda and not torch.cuda.is_available():
print("ERROR: cuda is not available, try running on CPU")
sys.exit(1)
if args.subcommand == "train":
check_paths(args)
train(args)
else:
stylize(args)
if __name__ == "__main__":
main()
Command:
python ./yoloToOnnx.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./yolov3.pt --cuda 0 --export_onnx ./yolov3.onnx
Convert ONNX Model to CoreML Models:
File onnx_to_coreml.py content:
import sys
from onnx import onnx_pb
from onnx_coreml import convert
model_in = sys.argv[1]
model_out = sys.argv[2]
model_file = open(model_in, 'rb')
model_proto = onnx_pb.ModelProto()
model_proto.ParseFromString(model_file.read())
coreml_model = convert(model_proto, image_input_names=['0'], image_output_names=['186'])
coreml_model.save(model_out)
Command:
python onnx_to_coreml.py ./yolov3.onnx ./yolov3.mlmodel
I am expecting that after conversion I should be able to use this model in my IOS app which will detect different objects with rectangles drawn along with their names.
The weird thing is while converting it is not throwing any error and it gives compiled successful message but not giving the expected output while using in IOS App.
Usually, pytorch transforms.ToTensor() Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. Please refer to:
https://pytorch.org/docs/stable/torchvision/transforms.html#torchvision.transforms.ToTensor
As a result, you need preprocess 255 to 1 using preprocessing_args, and depreprocess from 1 to 255 using deprocessing_args. refer to this link to check preprocessing_args.
https://github.com/onnx/onnx-coreml/blob/master/onnx_coreml/converter.py
the formula is red = red * image_scale + red_bias.
I guess the convert tool add a preprocessing layer before input and deprprocessing layer after output.
Here is the reference code for both input and output is image, and there is normalization operation for the image:
import sys
from onnx import onnx_pb
from onnx_coreml import convert
model_in = sys.argv[1]
model_out = sys.argv[2]
model_file = open(model_in, 'rb')
model_proto = onnx_pb.ModelProto()
model_proto.ParseFromString(model_file.read())
coreml_model = convert(model_proto,
image_input_names=['0'],
image_output_names=['186'],
preprocessing_args={'image_scale':1.0/255.0},
deprocessing_args={'image_scale':255.0})
coreml_model.save(model_out)
but for this case, the output is not image, and the input image also do BGR to RGB conversion, so you need more works.

Tensorflow Grid3LSTMCell visualization

I'm having a difficult time visualizing what this Tensorflow class creates. I want to implement a LSTM RNN that handles 3D data.
class Grid3LSTMCell(GridRNNCell):
"""3D BasicLSTM cell
This creates a 2D cell which receives input and gives output in the first dimension.
The first dimension can optionally be non-recurrent if `non_recurrent_fn` is specified.
The second and third dimensions are LSTM.
"""
def __init__(self, num_units, tied=False, non_recurrent_fn=None,
use_peepholes=False, forget_bias=1.0):
super(Grid3LSTMCell, self).__init__(num_units=num_units, num_dims=3,
input_dims=0, output_dims=0, priority_dims=0, tied=tied,
non_recurrent_dims=None if non_recurrent_fn is None else 0,
cell_fn=lambda n, i: rnn_cell.LSTMCell(
num_units=n, input_size=i, forget_bias=forget_bias,
use_peepholes=use_peepholes),
non_recurrent_fn=non_recurrent_fn)
The class is found in `from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell`.
This is difficult to explain, so I've provided a drawing. Here is what I want it to do...
However the comment sounds like it isn't doing this. The comment makes it sound like the RNN is still a flat RNN, where the first dimension is outputting to, what is commonly called, the outputs variable (see below). The second dimension is outputting to the next step in the RNN, and the third dimension is outputting to the next hidden layer.
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
If this is the case, what is the point in having the first and second dimensions? Aren't they essentially the same thing? The BasicLSTMCell sends the output to the next step into outputs -- in other words they are one in the same.
Clarity?
For reference, here is my example code...
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell
import numpy as np
#define parameters
learning_rate = 0.01
batch_size = 2
n_input_x = 10
n_input_y = 10
n_input_z = 10
n_hidden = 128
n_classes = 2
n_output = n_input_x * n_classes
x = tf.placeholder("float", [n_input_x, n_input_y, n_input_z])
y = tf.placeholder("float", [n_input_x, n_input_y, n_input_z, n_classes])
weights = {}
biases = {}
for i in xrange(n_input_y * n_input_z):
weights[i] = tf.Variable(tf.random_normal([n_hidden, n_output]))
biases[i] = tf.Variable(tf.random_normal([n_output]))
#generate random data
input_data = np.random.rand(n_input_x, n_input_y, n_input_z)
ground_truth = np.random.rand(n_input_x, n_input_y, n_input_z, n_classes)
#build GridLSTM
def GridLSTM_network(x):
x = tf.reshape(x, [-1,n_input_x])
x = tf.split(0, n_input_y * n_input_z, x)
lstm_cell = grid_rnn_cell.Grid3LSTMCell(n_hidden)
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
output = []
for i in xrange(n_input_y * n_input_z):
output.append(tf.matmul(outputs[i], weights[i]) + biases[i])
return output
#initialize network, cost, optimizer and all variables
pred = GridLSTM_network(x)
# import pdb
# pdb.set_trace()
pred = tf.pack(pred)
pred = tf.transpose(pred,[1,0,2])
pred= tf.reshape(pred, [-1, n_input_x, n_input_y, n_input_z, n_classes])
temp_pred = tf.reshape(pred, [-1,n_classes])
temp_y = tf.reshape(y,[-1, n_classes])
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(temp_pred, temp_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(0,tf.cast(tf.sub(tf.nn.sigmoid(temp_pred),temp_y), tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
step = 0
while 1:
print step
step = step + 1
# pdb.set_trace
sess.run(optimizer, feed_dict={x: input_data, y: ground_truth})

Resources