I want to create a denoising autoencoder for images of any shape. Most of the solutions out there have image shape not greater than (500,500) while the images I have are document scans of shape (3000,2000). I tried to reshape the images and build the model, but the predictions are incorrect. Could someone help me?
I have tried to build model with the code here https://github.com/mrdragonbear/Autoencoders/blob/master/Autoencoder-Tutorial.ipynb, playing around the image shape but the predictions fails.
I have a document denoiser already.
There is no need to have a model for a large shape, you can simply split them, feed them to the model, and then merge the predicted chunks together again.
My model accepts images of shape 512x512, so I have to split the images by 512x512 chunks.
The images must be larger than or equal to 512x512.
If the image is smaller then all you need is to resize it or fit it in a 512x512 shape.
def split_page(page):
chunk_size = (512, 512)
main_size = page.shape[:2]
chunks=[]
chunk_grid = tuple(np.array(main_size)//np.array(chunk_size))
extra_chunk = tuple(np.array(main_size)%np.array(chunk_size))
for yc in range(chunk_grid[0]):
row = []
for xc in range(chunk_grid[1]):
chunk = page[yc*chunk_size[0]:yc*chunk_size[0]+chunk_size[0], xc*chunk_size[1]: xc*chunk_size[1]+chunk_size[1]]
row.append(chunk)
if extra_chunk[1]:
chunk = page[yc*chunk_size[0]:yc*chunk_size[0]+chunk_size[0], page.shape[1]-chunk_size[1]:page.shape[1]]
row.append(chunk)
chunks.append(row)
if extra_chunk[0]:
row = []
for xc in range(chunk_grid[1]):
chunk = page[page.shape[0]-chunk_size[0]:page.shape[0], xc*chunk_size[1]: xc*chunk_size[1]+chunk_size[1]]
row.append(chunk)
if extra_chunk[1]:
chunk = page[page.shape[0]-chunk_size[0]:page.shape[0], page.shape[1]-chunk_size[1]:page.shape[1]]
row.append(chunk)
chunks.append(row)
return chunks, page.shape[:2]
def merge_chunks(chunks, osize):
extra = np.array(osize)%512
page = np.ones(osize)
for i, row in enumerate(chunks[:-1]):
for j, chunk in enumerate(row[:-1]):
page[i*512:i*512+512,j*512:j*512+512]=chunk
page[i*512:i*512+512,osize[1]-512:osize[1]]=chunks[i,-1]
if extra[0]:
for j, chunk in enumerate(chunks[-1][:-1]):
page[osize[0]-512:osize[0],j*512:j*512+512]=chunk
page[osize[0]-512:osize[0],osize[1]-512:osize[1]]=chunks[-1,-1]
else:
for j, chunk in enumerate(chunks[-1][:-1]):
page[osize[0]-512:osize[0],j*512:j*512+512]=chunk
page[osize[0]-512:osize[0],osize[1]-512:osize[1]]=chunks[-1,-1]
return page
def denoise(chunk):
chunk = chunk.reshape(1,512,512,1)/255.
denoised = model.predict(chunk).reshape(512,512)*255.
return denoised
def denoise_page(page):
chunks, osize= split_page(page)
chunks = np.array(chunks)
denoised_chunks = np.ones(chunks.shape)
for i, row in enumerate(chunks):
for j, chunk in enumerate(row):
denoised = denoise(chunk)
denoised_chunks[i][j]=denoised
denoised_page = merge_chunks(denoised_chunks, osize)
return denoised_page
Related
According to Tensorflow documentation, the padding is always with zeros instead of ones.
Is there there a way to change the padding to ones?
If not, what is the best alternative for a tensorflow dataset?
Here is my code example:
def resize_with_pad(image, label):
image = tf.image.resize_with_pad(image=image,
target_height=resized_wh,
target_width=resized_wh,
method=ResizeMethod.BILINEAR,
antialias=False)
return image, label
def create_tf_dataset_pipeline(tf_dataset):
tf_dataset = tf_dataset.map(load_image, num_parallel_calls=AUTOTUNE)
tf_dataset = tf_dataset.map(normalize, num_parallel_calls=AUTOTUNE)
tf_dataset = tf_dataset.map(resize_with_pad, num_parallel_calls=AUTOTUNE)
tf_dataset = tf_dataset.batch(batch_size)
tf_dataset = tf_dataset.prefetch(AUTOTUNE)
return tf_dataset
train_data = tf.data.Dataset.from_tensor_slices((x_train_filepaths, y_train_class))
train_data = create_tf_dataset_pipeline(train_data)
I tried resizing and padding the images and saving it in a directory (i.e. frontloading the processing), but that is very inflexible as I need to create a new dataset every time I want to train a model on a different size. It would be much better if I could do it dynamically with tensor flow.
AS a part of my Master's thesis, I have trained a UNET using Pytorch for detecting some objects in X-ray images. And to generate the predications, I have implemented the following function:
def make_predictions(model, imagePath):
# set model to evaluation mode
model.eval()
# turn off gradient tracking
with torch.no_grad():
# load the image from disk, expand its dimensions, cast it
# to float data type, and scale its pixel values
image = cv2.imread(imagePath, 0)
image = np.expand_dims(image, 0)
image = np.expand_dims(image, 0)
image = image.astype("float32") / 255.0
# find the filename and generate the path to ground truth mask
filename = imagePath.split(os.path.sep)[-1]
groundTruthPath = os.path.join(Config.Mask_dataset_dir, filename)
# load the ground-truth segmentation mask in grayscale mode and resize it
gtMask = cv2.imread(groundTruthPath, 0)
gtMask = cv2.resize(gtMask, (Config.Input_Height, Config.Input_Height))
# create a PyTorch tensor, and flash it to the current device
image = torch.from_numpy(image).to(Config.DEVICE)
# make the prediction, pass the results through the sigmoid
# function, and convert the result to a NumPy array
predMask = model(image)
predMask = torch.sigmoid(predMask)
predMask = predMask.cpu().numpy()
# filter out the weak predictions and convert them to integers
predMask = (predMask > Config.Thresh) * 255
predMask = predMask.astype(np.uint8)
filename = imagePath.split(os.path.sep)[-1]
cv2.imwrite(Config.Base_Out+'\\'+filename, predMask)
return(gtMask, predMask)
This function runs well for making the predictions and even plotting them. but the function cv2.imwrite() doesn't save the predictions as images in the passed directory, noting that filename already has the .PNG extension at the end. What could be the problem here?
i was trying to create a trackbar window and get hsv value of the image by adjusting the trackbar. created a mask and then adjusted the trackbar to detect an object of the hsv image
enter code here
def nothing(x):
pass
cv.namedWindow("Tracking")
cv.createTrackbar("LH","Tracking",0,255,nothing)
cv.createTrackbar("LS","Tracking",0,255,nothing)
cv.createTrackbar("LV","Tracking",0,255,nothing)
cv.createTrackbar("UH","Tracking",255,255,nothing)
cv.createTrackbar("US","Tracking",255,255,nothing)
cv.createTrackbar("UV","Tracking",255,255,nothing)
while True:
frame = cv.imread("C:/Users/acer/Desktop/insects/New folder/ins.jpg")
hsv = cv.cvtColor(frame,cv.COLOR_BGR2HSV)
l_h = cv.getTrackbarPos("LH","Tracking")
l_s = cv.getTrackbarPos("LS","Tracking")
l_v = cv.getTrackbarPos("LV","Tracking")
u_h = cv.getTrackbarPos("UH","Tracking")
u_s = cv.getTrackbarPos("US","Tracking")
u_v = cv.getTrackbarPos("UV","Tracking")
l_b = np.array([l_h,l_s,l_v])
u_b = np.array([u_h,u_s,u_v])
mask = (hsv,l_b,u_b)
res = cv.bitwise_and(frame,frame,mask=mask)
cv.imshow("frame",frame)
cv.imshow("mask",mask)
cv.imshow("res",res)
key = cv.waitKey(1)
if key == 27:
break
cv.destroyAllWindows()
There are a few issues with your code:
1) You have no import statements. You need at least:
import cv2 as cv
import numpy as np
2) Your indentation is incorrect. Your function nothing() should not be indented.
3) You omitted to call inRange(), you need:
mask = cv.inRange(hsv,l_b,u_b)
4) You have scaled the Hue into the range 0..255 when it actually has the range 0..180 when used with uint8 images so that 360 degrees comes out as 180 degrees which is less than the 255 upper limit of uint8.
By the way, it is fairly poor practice to do "loop invariant" stuff inside a loop - I mean the part where you hit the disk every millisecond and re-read the image, re-decode the JPEG and convert it to HSV. All that can be done outside the loop, then inside it, just do a quick memory copy of the HSV image.
I am trying to use LSTM autoencoder to do sequence-to-sequence learning with variable lengths of sequences as inputs, using following code:
inputs = Input(shape=(None, input_dim))
masked_input = Masking(mask_value=0.0, input_shape=(None,input_dim))(inputs)
encoded = LSTM(latent_dim)(masked_input)
decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(input_dim, return_sequences=True)(decoded)
sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)
where inputs are raw sequence data padded with 0s to the same length (timesteps). Using the code above, the output is also of length timesteps, but when we calculate loss function we only want first Ni elements of the output (where Ni is length of input sequence i, which may be different for different sequences). Does anyone know if there is some good way to do that?
Thanks!
Option 1: you can always train without padding if you accept to train separate batches.
See this answer to a simple way of separating batches of equal length: Keras misinterprets training data shape
In this case, all you have to do is to perform the "repeat" operation in another manner, since you don't have the exact length at training time.
So, instead of RepeatVector, you can use this:
import keras.backend as K
def repeatFunction(x):
#x[0] is (batch,latent_dim)
#x[1] is inputs: (batch,length,features)
latent = K.expand_dims(x[0],axis=1) #shape(batch,1,latent_dim)
inpShapeMaker = K.ones_like(x[1][:,:,:1]) #shape (batch,length,1)
return latent * inpShapeMaker
#instead of RepeatVector:
Lambda(repeatFunction,output_shape=(None,latent_dim))([encoded,inputs])
Option2 (doesn't smell good): use another masking after RepeatVector.
I tried this, and it works, but we don't get 0's at the end, we get the last value repeated until the end. So, you will have to make a weird padding in your target data, repeating the last step until the end.
Example: target [[[1,2],[5,7]]] will have to be [[[1,2],[5,7],[5,7],[5,7]...]]
This may unbalance your data a lot, I think....
def makePadding(x):
#x[0] is encoded already repeated
#x[1] is inputs
#padding = 1 for actual data in inputs, 0 for 0
padding = K.cast( K.not_equal(x[1][:,:,:1],0), dtype=K.floatx())
#assuming you don't have 0 for non-padded data
#padding repeated for latent_dim
padding = K.repeat_elements(padding,rep=latent_dim,axis=-1)
return x[0]*padding
inputs = Input(shape=(timesteps, input_dim))
masked_input = Masking(mask_value=0.0)(inputs)
encoded = LSTM(latent_dim)(masked_input)
decoded = RepeatVector(timesteps)(encoded)
decoded = Lambda(makePadding,output_shape=(timesteps,latent_dim))([decoded,inputs])
decoded = Masking(mask_value=0.0)(decoded)
decoded = LSTM(input_dim, return_sequences=True)(decoded)
sequence_autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)
Option 3 (best): crop the outputs directly from the inputs, this also eliminates the gradients
def cropOutputs(x):
#x[0] is decoded at the end
#x[1] is inputs
#both have the same shape
#padding = 1 for actual data in inputs, 0 for 0
padding = K.cast( K.not_equal(x[1],0), dtype=K.floatx())
#if you have zeros for non-padded data, they will lose their backpropagation
return x[0]*padding
....
....
decoded = LSTM(input_dim, return_sequences=True)(decoded)
decoded = Lambda(cropOutputs,output_shape=(timesteps,input_dim))([decoded,inputs])
For this LSTM Autoencoder architecture, which I assume you understand, the Mask is lost at the RepeatVector due to the LSTM encoder layer having return_sequences=False.
So another option, instead of cropping like above, could also be to create custom bottleneck layer that propagates the mask.
I am trying to find an efficient way to see if one image is a subset of another (meaning that each unique pixel in one image is also found in the other.) The repetition or ordering of the pixels do not matter.
I am working in Java, so I would like all of my operations to be completed in OpenCV for efficiency's sake.
My first idea was to export a list of unique pixel values, and compare it to the list from the second image.
As there is not a built in function to extract unique pixels, I abandoned this approach.
I also understand that I can find the locations of a particular color with the inclusive inRange, and findNonZero operations.
Core.inRange(image, color, color, tempMat); // inclusive
Core.findNonZero(tempMat, colorLocations);
Unfortunately, this does not provide an adequate answer, as it would need to be executed per color, and would still require extracting unique pixels.
Essentially, I'm asking if there is a clever way to use the built in OpenCV functions to see if an image is comprised of the pixels found in another image.
I understand that this will not work for slight color differences. I am working on a limited dataset, and care about the exact pixel values.
To put the question more mathematically:
Because the only think you are interested in is the pixel values i would suggest to do the following.
Compute the histogram of image 1 using hist1 = calcHist()
Compute the histogram of image 2 using hist2 = calcHist()
Calculate the difference vector diff = hist1 - hist2
Check if each bin of the hist of the subimage is less or equal than the corresponding bin in the hist of the bigger image
Thanks to Miki for the fix.
I will keep Amitay's as the accepted answer, as he absolutely lead me down the correct path. I wanted to also share my exact answer for anyone who finds this in the future.
As I stated in my question, I was looking for an efficient way to see if the RGB values of one image were a subset of the RGB values of another image.
I made a function to the following specification:
The Java code is as follows:
private boolean isSubset(Mat subset, Mat subMask, Mat superset) {
// Get unique set of pixels from both images
subset = getUniquePixels(subset, subMask);
superset = getUniquePixels(superset, null);
// See if the superset pixels encapsulate the subset pixels
// OR the unique pixels together
Mat subOrSuper = new Mat();
Core.bitwise_or(subset, superset, subOrSuper);
//See if the ORed matrix is equal to the superset
Mat notEqualMat = new Mat();
Core.compare(superset, subOrSuper, notEqualMat, Core.CMP_NE);
return Core.countNonZero(notEqualMat) == 0;
}
subset and superset are assumed to be CV_8UC3 matricies, while subMask is assumed to be CV_8UC1.
private Mat getUniquePixels(Mat img, Mat mask) {
if (mask == null) {
mask = new Mat();
}
// int bgrValue = (b << 16) + (g << 8) + r;
img.convertTo(img, CvType.CV_32FC3);
Vector<Mat> splitImg = new Vector<>();
Core.split(img, splitImg);
Mat flatImg = Mat.zeros(img.rows(), img.cols(), CvType.CV_32FC1);
Mat multiplier;
for (int i = 0; i < splitImg.size(); i++) {
multiplier = Mat.ones(img.rows(), img.cols(), CvType.CV_32FC1);
// set powTwo = to 2^i;
int powTwo = (1 << i);
// Set multiplier matrix equal to powTwo;
Core.multiply(multiplier, new Scalar(powTwo), multiplier);
// n<<i == n * 2^i;
// I'm shifting the RGB values into separate parts of the same 32bit
// integer.
Core.multiply(multiplier, splitImg.get(i), splitImg.get(i));
// Add the shifted RGB components together.
Core.add(flatImg, splitImg.get(i), flatImg);
}
// Create a histogram of the pixel values.
List<Mat> images = new ArrayList<>();
images.add(flatImg);
MatOfInt channels = new MatOfInt(0);
Mat hist = new Mat();
// 16777216 == 256*256*256
MatOfInt histSize = new MatOfInt(16777216);
MatOfFloat ranges = new MatOfFloat(0f, 16777216f);
Imgproc.calcHist(images, channels, mask, hist, histSize, ranges);
Mat uniquePixels = new Mat();
Core.inRange(hist, new Scalar(1), new Scalar(Float.MAX_VALUE), uniquePixels);
return uniquePixels;
}
Please feel free to ask questions, or point out problems!