PyTorch augmentation - machine-learning

I'm new to machine learning and PyTorch. I'm using the imgaug library for image augmentation.
I have this code:
class ImgAugTransform:
def __init__(self):
self.aug = seq = iaa.Sequential(
# Apply the following augmenters to most images
iaa.Fliplr(0.5), # horizontally flip 50% of all images
iaa.Flipud(0.2), # vertically flip 20% of all images
random_aug_use(iaa.CropAndPad( # crop images by -5% to 10% of their height/width
percent=(-0.1, 0.2),
scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis
translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis)
rotate=(-45, 45), # rotate by -45 to +45 degrees
shear=(-16, 16), # shear by -16 to +16 degrees
order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)
cval=(0, 255), # if mode is constant, use a cval between 0 and 255
mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples)
def __call__(self, img):
img = np.array(img)
return self.aug.augment_image(img)
train_transforms = ImgAugTransform()
train_dataset = torchvision.datasets.ImageFolder(train_dir, train_transforms)
train_dataloader =
train_dataset, batch_size=batch_size, shuffle=True, num_workers=batch_size)
So now I can't do this:
X_batch, y_batch = next(iter(train_dataloader))
I get error:
ValueError: some of the strides of a given numpy array are negative. This is currently not supported, but will be added in future releases.

I came across this error as well.
The solution that worked for me was:
def __call__(self, img):
img = np.array(img)
return self.aug.augment_image(img).copy()
But, if you're composing imgaug with torchvision.transforms you can do something like:
def __call__(self, img):
img = self.aug.augment_image(np.array(img))
transforms = torchvision.transforms.Compose([
return transforms(img.copy())

You should make your augmented numpy arrays contiguous again.
try modifying your augmenter code to:
def __call__(self, img):
img = np.array(img)
return np.ascontiguousarray(self.aug.augment_image(img))


PyTorch - scaling data for training and then rescaling results back

I am working on an autoencoder network using pytorch. I have a dataset of rows that have 10 columns each containing values in roughly [-0.2, 0.2].
Since all the built-in functions for automated data preparation that I know of work for images and other data types, I assume I have to rescale these values into the [0, 1] range myself, train the network, and then scale every result back to the original dataset's scale.
The scaling algorithm I used was (input is scaled data for training, output is result of network):
input -= min(data)
input /= max(input)
output *= (abs(min(data)) + max(data)) //last division was by "shifted" max
output += min(data)
Here is an actual code:
class AirfoilDataset(
def __init__(self, data):
self.airfoils = np.copy(data)
self.airfoils -= self.airfoils.min()
self.airfoils /= self.airfoils.max()
def __len__(self):
return len(self.airfoils)
def __getitem__(self, idx):
return torch.from_numpy(self.airfoils[idx]), idx
class Autoencoder(torch.nn.Module):
def __init__(self):
self.encoder = torch.nn.Sequential(
torch.nn.Linear(10, 5),
self.decoder = torch.nn.Sequential(
torch.nn.Linear(5, 10),
def forward(self, x):
x = self.encoder(x)
x = self.decoder(x)
return x
The results I get from this are really bad: the output is somehow deformed (I don't know the proper terminology). It visibly follows the shape of the original dataset, but only very roughly.
On the other hand, if I don't scale the data used for training, the positive range of the original dataset is represented perfectly by the autoencoder, without distortions. Obviously, the negative part is reduced to zero.
How can I preserve the "shape" of the input dataset through training?
You can use sklearn for that
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on the training data and scale it before training.
# inverse_transform() below only works on a scaler that has been fitted.
# NOTE(review): `data` stands for the 2-D training array from the question;
# confirm the name against the surrounding code.
data = scaler.fit_transform(data)
#here your model
# Map the network output back to the original value range.
output = scaler.inverse_transform(output)

Resizing inputs for torch model

I keep running into this error and could not find an exact solution or a general fix for it. My inputs are 48x48, and that does not match the input shape of resnet101. How can I modify my input to fit resnet101? You can see my code below; it will probably help you understand my problem.
if __name__ == "__main__":
vid = cv2.VideoCapture(0)
emotions = []
while vid.isOpened():
image = cv2.imread("/home/berkay/Desktop/angry_man.jpg")
_, frame =
# takes in a gray coloured filter of the frame
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# initializing the haarcascade face detector
faces = face_cascade.detectMultiScale(frame)
for (x, y, w, h) in faces:
    # Take the region of interest of the face only, in gray.
    # Rows span the box height (y..y+h) and columns span the box width
    # (x..x+w); the column slice previously used `h`, which crops the wrong
    # region whenever the detected box is not square.
    roi_gray = gray[y:y + h, x:x + w]
    resized = cv2.resize(roi_gray, (48, 48))  # resizes to 48x48 sized image
    # predict the mood
    img = img2tensor(resized)
    prediction = predict(img)
At that point, I'm getting this error:
weight of size [64, 3, 7, 7], expected input[1, 1, 229, 229] to have 3 channels, but got 1 channels instead
How can I fix this? Thanks in advance
You can modify the input layer of resnet so that it accepts single-channel tensor inputs, using:
In [1]: model = resnet101()
In [2]: model.conv1 = nn.Conv2d(1, 64, kernel_size=(2, 2))
In [3]: model(torch.rand(10, 1, 48, 48))
tensor([[-0.5015, 0.6124, 0.1370, ..., 1.2181, -0.4707, 0.3285],
[-0.4776, 1.1027, 0.0161, ..., 0.6363, -0.4733, 0.6218],
[-0.3935, 0.8276, -0.0316, ..., 0.6853, -0.4735, 0.6424],
[-0.2986, 1.1758, 0.0158, ..., 0.7422, -0.4422, 0.4792],
[-0.2668, 0.7884, -0.1205, ..., 1.1445, -0.6249, 0.6697],
[-0.2139, 1.0412, 0.2326, ..., 0.8332, -0.8744, 0.4827]],
(you will probably need to modify the kernel size accordingly too)

How to count vehicles using opencv in python?

I am working on a VCS (vehicle counting system) project. The scope of the project is to classify and count vehicles. I have built a custom model using Faster-RCNN in the TensorFlow Object Detection API. This model contains only 7 classes, such as car, motorbike, bicycle, etc. The model works perfectly, but the problem is counting: it is very hard to count vehicles in a video frame. I did some preliminary research on the internet and tried a lot, but I could not find any useful information. There are some projects on GitHub, and they use tracking methods.
What I want is the following: draw a horizontal line in the frame, and when a vehicle touches it, the count should be incremented. How can I do this? I don't know the algorithm behind it. I heard that centroid tracking would help me.
In short, my question is how to count vehicles when they touch the horizontal line. I have linked a sample image below.
import os
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
# Import utilites
from utils import label_map_util
from utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = 'inference_graph'
VIDEO_NAME = 'Video_105.mp4'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH,'training','labelmap.pbtxt')
# Path to video
# Number of classes the object detector can identify
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph =
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Open video file
video = cv2.VideoCapture(PATH_TO_VIDEO)
# Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
# i.e. a single-column array, where each item in the column has the pixel RGB value
ret, frame =
frame_expanded = np.expand_dims(frame, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) =
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: frame_expanded})
# Draw the results of the detection (aka 'visulaize the results')
# All the results have been drawn on the frame, so it's time to display it.
final_score = np.squeeze(scores)
count = 0
cv2.line(frame, (1144, 568), (1723,664), (0,0,255), 2) #Line
for i in range(100):
if scores is None or final_score[i] > 0.90:
min_score_thresh = 0.90
bboxes = boxes[scores > min_score_thresh]
im_width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
im_height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
final_box = []
for box in bboxes:
ymin, xmin, ymax, xmax = box
final_box.append([xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height])
cv2.imshow('Object detector', frame)
# Press 'q' to quit
if cv2.waitKey(1) == ord('q'):
# Clean up
# import the necessary packages
from import VideoStream
from import FPS
import argparse
import imutils
import time
import cv2
tracker = cv2.TrackerCSRT_create()
vs = cv2.VideoCapture("Video.mp4")
initBB = None
detec = []
def pega_centro(x, y, w, h):
    """Return the centre point of a bounding box.

    Args:
        x: X coordinate of the box origin.
        y: Y coordinate of the box origin.
        w: Box width.
        h: Box height.

    Returns:
        A ``(cx, cy)`` tuple: the origin offset by half the width and half
        the height, each half truncated toward zero by ``int``.
    """
    # The body was flush with the ``def`` line, which Python rejects with an
    # IndentationError; it is indented here so the function can be defined.
    cx = x + int(w / 2)
    cy = y + int(h / 2)
    return cx, cy
roi = 480
counter = 0
offset = 6
# loop over frames from the video stream
while vs.isOpened():
ret,frame =
cv2.line(frame, (769 , roi), (1298 , roi), (255,0,0), 3)
# check to see if we are currently tracking an object
if initBB is not None:
# grab the new bounding box coordinates of the object
(success, box) = tracker.update(frame)
# check to see if the tracking was a success
if success:
(x, y, w, h) = [int(v) for v in box]
cv2.rectangle(frame, (x, y), (x + w, y + h),
(0, 255, 0), 2)
cX = int((x + x+w) / 2.0)
cY = int((y + y+h) / 2.0), (cX, cY), 3, (0, 0, 255), -1)
c=pega_centro(x, y, w, h)
for (x,y) in detec:
if y<(roi+offset) and y>(roi-offset):
cv2.line(frame, (769 , roi), (1298 , roi), (0,0,255), 3)
# show the output frame
cv2.imshow("Frame", frame)
key = cv2.waitKey(1) & 0xFF
if key == ord("s"):
# select the bounding box of the object we want to track (make
# sure you press ENTER or SPACE after selecting the ROI)
initBB = cv2.selectROI("Frame", frame, fromCenter=False,
# start OpenCV object tracker using the supplied bounding box
# coordinates, then start the FPS throughput estimator as well
tracker.init(frame, initBB)
fps = FPS().start()
# if the `q` key was pressed, break from the loop
elif key == ord("q"):

Putting my pictures over black ground not working OpenCV

So this is what I have now:
As you can see, the neural style transfer thing is only going over the area the detection box is detecting. I am trying to put the transformed cool picture (which will always be less than 1200 x 900 because the detection box is 1200 x 900) in a black picture with dimensions 1200 x 900 so that I can save the video file.
My box is measured with: startX, endX, startY, and endY. The way I am trying to put the cool picture over the background right now is: black_background[startY:endY, startX:endX] = output, where output also has the size (endY - startY, endX - startX).
My way is not working, any insights? Also, for some reason, when I do "black_background[startY:endY, startX:endX] = output", there is often a broadcasting error where the shapes are off by a few pixels, e.g. it cannot assign an array of shape (859, 100, 3) into a region of shape (860, 100, 3). Is there a non-buggy solution to the black background issue? I feel like manually doing black_background[startY:endY, startX:endX] = output is weird.
Here's my full code, I marked the if loop that actually matters with -----, thank you!
from __future__ import print_function
from import VideoStream
from import FPS
import numpy as np
import argparse
import imutils
import time
import cv2
from imutils import paths
import itertools
# We need to input model prototxt
ap = argparse.ArgumentParser()
ap.add_argument("-p", "--prototxt", required=True,
help="path to Caffe 'deploy' prototxt file")
ap.add_argument("-m", "--model", required=True,
help="path to Caffe pre-trained model")
ap.add_argument("-c", "--confidence", type=float, default=0.2,
help="minimum probability to filter weak detections")
ap.add_argument("-nm", "--neuralmodels", required=True,
help="path to directory containing neural style transfer models")
args = vars(ap.parse_args())
# we should identify the class first, and then transfer that block
CLASSES = ["background", "aeroplane", "bicycle", "bird", "boat",
"bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
"dog", "horse", "motorbike", "person", "pottedplant", "sheep",
"sofa", "train", "tvmonitor"]
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
# load our serialized model from disk
print("[INFO] loading model...")
DetectionNet = cv2.dnn.readNetFromCaffe(args["prototxt"], args["model"])
# grab the paths to all neural style transfer models in our 'models'
# directory, provided all models end with the '.t7' file extension
modelPaths = paths.list_files(args["neuralmodels"], validExts=(".t7",))
modelPaths = sorted(list(modelPaths))
# generate unique IDs for each of the model paths, then combine the
# two lists together
models = list(zip(range(0, len(modelPaths)), (modelPaths)))
# use the cycle function of itertools that can loop over all model
# paths, and then when the end is reached, restart again
modelIter = itertools.cycle(models)
(modelID, modelPath) = next(modelIter)
NTSnet = cv2.dnn.readNetFromTorch(modelPath)
# initialize the video stream, allow the cammera sensor to warmup,
# and initialize the FPS counter
print("[INFO] starting video stream...")
vs = VideoStream(src=1).start()
fps = FPS().start()
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_video = cv2.VideoWriter('output.avi', fourcc, 20.0, (1200, 900))
while True:
# grab the frame from the threaded video stream and resize it
# (the resize call below requests width=1200, height=900)
frame =
frame = imutils.resize(frame, width=1200, height=900)
# grab the frame dimensions and convert it to a blob
(h, w) = frame.shape[:2]
blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)),
0.007843, (300, 300), 127.5)
# pass the blob through the network and obtain the detections and
# predictions
detections = DetectionNet.forward()
# loop over the detections
for i in np.arange(0, detections.shape[2]):
# extract the confidence (i.e., probability) associated with
# the prediction
confidence = detections[0, 0, i, 2]
# filter out weak detections by ensuring the `confidence` is
# greater than the minimum confidence
if confidence > args["confidence"]:
# extract the index of the class label from the
# `detections`, then compute the (x, y)-coordinates of
# the bounding box for the object
idx = int(detections[0, 0, i, 1])
if(CLASSES[idx] == "person" and confidence > .90):
box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
(startX, startY, endX, endY) = box.astype("int")
# draw the prediction on the frame
label = "{}: {:.2f}%".format("PERSON",
confidence * 100)
cv2.rectangle(frame, (startX, startY), (endX, endY),
COLORS[idx], 2)
y = startY - 15 if startY - 15 > 15 else startY + 15
cv2.putText(frame, label, (startX, y),
# print box area in background
newimage = frame[startY:endY, startX:endX]
(h, w) = newimage.shape[:2]
#print(startX, endX, startY, endY)
noise_picture = cv2.imread('white_noise.jpg')
black_background = cv2.imread('black.png')
*if(h > 0 and w > 0):
# to_be_transformed is the detection box area
# resize that area for MobileNetSSD
#to_be_transformed = imutils.resize(to_be_transformed, height=450)
(height_orig, width_orig) = noise_picture.shape[:2]
noise_picture[startY:endY, startX:endX] = newimage
noise_picture = imutils.resize(noise_picture, height=450)
# run it through the network, output is the image
(h, w) = noise_picture.shape[:2]
# print(h, w)
blob2 = cv2.dnn.blobFromImage(noise_picture, 1.0, (w, h), (103.939, 116.779, 123.680), swapRB=False, crop=False)
output = NTSnet.forward()
output = output.reshape((3, output.shape[2], output.shape[3]))
output[0] += 103.939
output[1] += 116.779
output[2] += 123.680
output /= 255.0
output = output.transpose(1, 2, 0)
# set the 600 x 450 back to the original size
black_background = imutils.resize(black_background, width=1200, height = 900)
output = imutils.resize(output, width=1200)
#black_background[startY:endY, startX:endX] = output[startY:endY, startX:endX]
output = output[startY:endY, startX:endX]
(h2, w2) = output.shape[:2]
if(h2>0 and w2>0 ):
cv2.imshow('hmm', output)
black_background[startY:endY, startX:endX] = output
cv2.imshow("uh", black_background)
# show the output frame, which is the whole thing
cv2.imshow("Frame", frame)
key = cv2.waitKey(1) & 0xFF
# if the `q` key was pressed, break from the loop
if key == ord("q"):
# update the FPS counter
# stop the timer and display FPS information
print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))
# do a bit of cleanup
Oh man, this is the second time I've made this mistake. You have to multiply by 255 when you add your output picture to your background. This is really weird: it seems like imread works if you only put in numbers in [0, 1], but once you have a value that goes over 1, it treats the range as [0, 255]. Don't take my word for it, though.

TensorFlow average gradients over several batches

This is a possible duplicate of Tensorflow: How to get gradients per instance in a batch?. I ask it anyway, because there has not been a satisfying answer and the goal here is a bit different.
I have a very big network that I can fit on my GPU but the max batch size I can feed is 32. Anything bigger than that causes the GPU to run out of memory. I want to use a bigger batch in order to get a more accurate approximation of the gradient.
For concreteness, let's say I want to compute the gradient on a big batch of size 96 by feeding 3 batches of 32 in turn. The best way that I know of is to use Optimizer.compute_gradients() and Optimizer.apply_gradients(). Here is a small example of how it can work:
import tensorflow as tf
import numpy as np
learn_rate = 0.1
W_init = np.array([ [1,2,3], [4,5,6], [7,8,9] ], dtype=np.float32)
x_init = np.array([ [11,12,13], [14,15,16], [17,18,19] ], dtype=np.float32)
X = tf.placeholder(dtype=np.float32, name="x")
W = tf.Variable(W_init, dtype=np.float32, name="w")
y = tf.matmul(X, W, name="y")
loss = tf.reduce_mean(y, name="loss")
opt = tf.train.GradientDescentOptimizer(learn_rate)
grad_vars_op = opt.compute_gradients(loss)
sess = tf.Session()
# Compute the gradients for each batch
grads_vars1 =, feed_dict = {X: x_init[None,0]})
grads_vars2 =, feed_dict = {X: x_init[None,1]})
grads_vars3 =, feed_dict = {X: x_init[None,2]})
# Separate the gradients from the variables
grads1 = [ grad for grad, var in grads_vars1 ]
grads2 = [ grad for grad, var in grads_vars2 ]
grads3 = [ grad for grad, var in grads_vars3 ]
varl = [ var for grad, var in grads_vars1 ]
# Average the gradients
grads = [ (g1 + g2 + g3)/3 for g1, g2, g3 in zip(grads1, grads2, grads3)],varl)))
print("Weights after 1 gradient")
Now this is all very ugly and inefficient since the forward pass is being run on the GPU while averaging the gradients happens on the CPU and then applying them happens on the GPU again.
Moreover, this code throws an exception because grads is a list of np.arrays and to make it work, one would have to create a tf.placeholder for every gradient.
I am sure there should be a better and more efficient way to do this? Any suggestions?
You can create a copy of trainable_variables and accumulate the batch gradients. Here are a few simple steps to follow:
opt = tf.train.GradientDescentOptimizer(learn_rate)
# number of batches whose gradients are accumulated before one update;
# it has to exist before the scaling constant below is built
n_batches = 3
# constant to scale sum of gradient; 1.0 keeps the division in floating
# point, because 1/n_batches is integer division (== 0) under Python 2
const = tf.constant(1.0 / n_batches)
# get all trainable variables
t_vars = tf.trainable_variables()
# create a copy of all trainable variables with `0` as initial values
# (the comprehension previously referenced an undefined name `tv`)
accum_tvars = [tf.Variable(tf.zeros_like(t_var.initialized_value()), trainable=False) for t_var in t_vars]
# create a op to initialize all accums vars
zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_tvars]
# compute gradients for a batch
batch_grads_vars = opt.compute_gradients(loss, t_vars)
# collect the (scaled by const) batch gradient into accumulated vars
# (the closing parenthesis of assign_add(...) was missing)
accum_ops = [accum_tvars[i].assign_add(tf.scalar_mul(const, batch_grad_var[0])) for i, batch_grad_var in enumerate(batch_grads_vars)]
# apply accums gradients
train_step = opt.apply_gradients([(accum_tvars[i], batch_grad_var[1]) for i, batch_grad_var in enumerate(batch_grads_vars)])
# train_step = opt.apply_gradients(zip(accum_tvars, zip(*batch_grads_vars)[1])
while True:
# initialize the accumulated gards
# number of batches for gradient accumulation
n_batches = 3
for i in xrange(n_batches):, feed_dict={X: x_init[:, i]})
