I have a JPG picture on which I'd like to perform some operations in order to use pattern recognition. The picture is being rotated and also some filters like color inversion, greyscale,.. are applied
The program goes like this
for(i=0;i<360;i++){
rotate(pic,i);
foreach(filter as f){
f(pic);
recognize(pic);
}
}
In order to increase speed I'd like to have the source image loaded in RAM and then read from there. Is it possible?
You can write the image to mpr:, or clone the image instance to a new structure. Regardless if where the original source is in memory, you will still need to copy the data in the first for loop. Here's an example, in C, that holds a wand instance and clones each iteration.
#include <stdio.h>
#include <MagickWand/MagickWand.h>
void rotate(MagickWand * wand, double degree) {
PixelWand * pwand = NewPixelWand();
PixelSetColor(pwand, "white");
MagickRotateImage(wand, pwand, degree);
DestroyPixelWand(pwand);
}
void _f(MagickWand * wand, FilterTypes filter) {
double x,y;
x = y = 0.0;
MagickResampleImage(wand, x, y, filter);
}
void recognize(MagickWand * wand) {
// ???
}
int main(int argc, const char * argv[]) {
MagickWandGenesis();
MagickWand * wand, * copy_wand;
wand = NewMagickWand();
MagickReadImage(wand, "rose:");
for ( int i = 0; i < 360 ; i++ ) {
copy_wand = CloneMagickWand(wand);
for ( FilterTypes f = UndefinedFilter; f < SentinelFilter; f++ ) {
_f(copy_wand, f);
recognize(copy_wand);
}
}
MagickWandTerminus();
return 0;
}
The MPR writes to a specific page in memory, and can be identified by a user defined label.
MagickReadImage(wand, "rose:");
MagickWriteImage(wand, "mpr:original"); // Save image to "original" label
for ( int i = 0; i < 360 ; i++ ) {
copy_wand = NewMagickWand();
MagickReadImage(copy_wand, "mpr:original"); // Read image from "original" label
for ( FilterTypes f = UndefinedFilter; f < SentinelFilter; f++ ) {
_f(copy_wand, f);
recognize(copy_wand);
}
copy_wand = DestroyMagickWand(copy_wand);
}
The last option I can think of is to copy the image pixel-data into memory, and re-reference it with each iteration. This allows some performance improvements, and I'm thinking OpenMP, but you'll loose a lot of helper methods.
MagickReadImage(wand, "rose:");
size_t w = MagickGetImageWidth(wand);
size_t h = MagickGetImageHeight(wand);
size_t data_length = w * h * 4;
char * data = malloc(data_length);
MagickExportImagePixels(wand, 0, 0, w, h, "RGBA", CharPixel, (void *)data);
for ( int i = 0; i < 360; i++ ) {
long * copy_data = malloc(data_length);
memcpy(copy_data, data, data_length);
As you haven't specified a language or an operating system, I'll show you how to do that with Magick++ in C++ in a Linux/OSX environment:
#include <Magick++.h>
#include <iostream>
using namespace std;
using namespace Magick;
int main(int argc,char **argv)
{
InitializeMagick(*argv);
// Create an image object
Image image;
// Read a file into image object
image.read( "input.gif" );
// Crop the image to specified size (width, height, xOffset, yOffset)
image.crop( Geometry(100,100, 0, 0) );
// Repage the image to forget it was part of something bigger
image.repage();
// Write the image to a file
image.write( "result.gif" );
return 0;
}
Compile with:
g++ -o program program.cpp `Magick++-config --cppflags --cxxflags --ldflags --libs`
You will need an image called input.gif for it to read and that should be bigger than 100x100, so create one with:
convert -size 256x256 xc:gray +noise random input.gif
Related
I want to apply RealSense library's depth filtering (rs2::spatial_filter) on an OpenCV Mat, but it seems like the filter is not being applied. The original depth image and the supposedly filtered depth image look exactly the same.
To load raw depth data into rs2::frame, I used modified #Groch88's answer. One of the changes I made was changing the depth format from RS2_FORMAT_Z16 to RS2_FORMAT_DISTANCE (to be able to load a float depth map) and not loading the RGB part of the frame. The whole source is below.
Why do the original and the filtered images look exactly the same? Am I missing something obvious?
main.cpp:
#include <iostream>
#include <opencv2/opencv.hpp>
#include <librealsense2/rs.hpp>
#include "rsImageConverter.h"
int main()
{
cv::Mat rawDepthImg = load_raw_depth("/path/to/depth/image"); // loads float depth image
rsImageConverter ic{rawDepthImg.cols, rawDepthImg.rows, sizeof(float)};
if ( !ic.convertFrame(rawDepthImg.data, rawDepthImg.data) )
{
fprintf(stderr, "Could not load depth.\n");
exit(1);
}
rs2::frame rsDepthFrame = ic.getDepth();
// Filter
// https://dev.intelrealsense.com/docs/post-processing-filters
rs2::spatial_filter spat_filter;
spat_filter.set_option(RS2_OPTION_FILTER_MAGNITUDE, 2.0f);
spat_filter.set_option(RS2_OPTION_FILTER_SMOOTH_ALPHA, 0.5f);
spat_filter.set_option(RS2_OPTION_FILTER_SMOOTH_DELTA, 20.0f);
// Apply filtering
rs2::frame rsFilteredDepthFrame;
rsFilteredDepthFrame = spat_filter.process(rsDepthFrame);
// Copy filtered depth to OpenCV Mat
cv::Mat filteredDepth = cv::Mat::zeros(rawDepthImg.size(), CV_32F);
memcpy(filteredDepth.data, rsFilteredDepthFrame.get_data(), rawDepthImg.cols * rawDepthImg.rows * sizeof(float));
// Display (un)filtered images
cv::imshow("Original depth", rawDepthImg); // Original image is being shown
cv::imshow("Filtered depth", filteredDepth); // A depth image that looks exactly like the original unfiltered depth map is shown
cv::imshow("Diff", filteredDepth - rawDepthImg); // A black image is being shown
cv::waitKey(0);
return 0;
}
rsImageConverter.h (edited version of #Doch88's code):
#include <librealsense2/rs.hpp>
#include <librealsense2/hpp/rs_internal.hpp>
class rsImageConverter
{
public:
rsImageConverter(int w, int h, int bppDepth);
bool convertFrame(uint8_t* depth_data, uint8_t* color_data);
rs2::frame getDepth() const;
private:
int w = 640;
int h = 480;
int bppDepth = sizeof(float);
rs2::software_device dev;
rs2::software_sensor depth_sensor;
rs2::stream_profile depth_stream;
rs2::syncer syncer;
rs2::frame depth;
int ind = 0;
};
rsImageConverter.cpp (edited version of #Doch88's code):
#include "rsImageConverter.h"
rsImageConverter::rsImageConverter(int w, int h, int bppDepth) :
w(w),
h(h),
bppDepth(bppDepth),
depth_sensor(dev.add_sensor("Depth")) // initializing depth sensor
{
rs2_intrinsics depth_intrinsics{ w, h, (float)(w / 2), (float)(h / 2), (float) w , (float) h , RS2_DISTORTION_BROWN_CONRADY ,{ 0,0,0,0,0 } };
depth_stream = depth_sensor.add_video_stream({ RS2_STREAM_DEPTH, 0, 0,
w, h, 60, bppDepth,
RS2_FORMAT_DISTANCE, depth_intrinsics });
depth_sensor.add_read_only_option(RS2_OPTION_DEPTH_UNITS, 1.f); // setting depth units option to the virtual sensor
depth_sensor.open(depth_stream);
depth_sensor.start(syncer);
}
bool rsImageConverter::convertFrame(uint8_t* depth_data, uint8_t* color_data)
{
depth_sensor.on_video_frame({ depth_data, // Frame pixels
[](void*) {}, // Custom deleter (if required)
w*bppDepth, bppDepth, // Stride and Bytes-per-pixel
(rs2_time_t)ind * 16, RS2_TIMESTAMP_DOMAIN_HARDWARE_CLOCK, ind, // Timestamp, Frame# for potential sync services
depth_stream });
ind++;
rs2::frameset fset = syncer.wait_for_frames();
depth = fset.first_or_default(RS2_STREAM_DEPTH);
return depth;
}
rs2::frame rsImageConverter::getDepth() const
{
return depth;
}
I took the code in https://gist.github.com/kyrs/9adf86366e9e4f04addb (which takes an opencv cv::Mat image as input and converts it to tensor) and I use it to label images with the model inception_v3_2016_08_28_frozen.pb stated in the Tensorflow tutorial (https://www.tensorflow.org/tutorials/image_recognition#usage_with_the_c_api). Everything worked fine when using a batchsize of 1. However, when I increase the batchsize to 2 (or greater), the size of
finalOutput (which is of type std::vector) is zero.
Here's the code to reproduce the error:
// Only for VisualStudio
#define COMPILER_MSVC
#define NOMINMAX
#include <string>
#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/framework/tensor.h"
int batchSize = 2;
int height = 299;
int width = 299;
int depth = 3;
int mean = 0;
int stdev = 255;
// Set image paths
cv::String pathFilenameImg1 = "D:/IMGS/grace_hopper.jpg";
cv::String pathFilenameImg2 = "D:/IMGS/lenna.jpg";
// Set model paths
std::string graphFile = "D:/Tensorflow/models/inception_v3_2016_08_28_frozen.pb";
std::string labelfile = "D:/Tensorflow/models/imagenet_slim_labels.txt";
std::string InputName = "input";
std::string OutputName = "InceptionV3/Predictions/Reshape_1";
void read_prepare_image(cv::String pathImg, cv::Mat &imgPrepared) {
// Read Color image:
cv::Mat imgBGR = cv::imread(pathImg);
// Now we resize the image to fit Model's expected sizes:
cv::Size s(height, width);
cv::Mat imgResized;
cv::resize(imgBGR, imgResized, s, 0, 0, cv::INTER_CUBIC);
// Convert the image to float and normalize data:
imgResized.convertTo(imgPrepared, CV_32FC1);
imgPrepared = imgPrepared - mean;
imgPrepared = imgPrepared / stdev;
}
int main()
{
// Read and prepare images using OpenCV:
cv::Mat img1, img2;
read_prepare_image(pathFilenameImg1, img1);
read_prepare_image(pathFilenameImg2, img2);
// creating a Tensor for storing the data
tensorflow::Tensor input_tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({ batchSize, height, width, depth }));
auto input_tensor_mapped = input_tensor.tensor<float, 4>();
// Copy images data into the tensor:
for (int b = 0; b < batchSize; ++b) {
const float * source_data;
if (b == 0)
source_data = (float*)img1.data;
else
source_data = (float*)img2.data;
for (int y = 0; y < height; ++y) {
const float* source_row = source_data + (y * width * depth);
for (int x = 0; x < width; ++x) {
const float* source_pixel = source_row + (x * depth);
const float* source_B = source_pixel + 0;
const float* source_G = source_pixel + 1;
const float* source_R = source_pixel + 2;
input_tensor_mapped(b, y, x, 0) = *source_R;
input_tensor_mapped(b, y, x, 1) = *source_G;
input_tensor_mapped(b, y, x, 2) = *source_B;
}
}
}
// Load the graph:
tensorflow::GraphDef graph_def;
ReadBinaryProto(tensorflow::Env::Default(), graphFile, &graph_def);
// create a session with the graph
std::unique_ptr<tensorflow::Session> session_inception(tensorflow::NewSession(tensorflow::SessionOptions()));
session_inception->Create(graph_def);
// run the loaded graph
std::vector<tensorflow::Tensor> finalOutput;
session_inception->Run({ { InputName,input_tensor } }, { OutputName }, {}, &finalOutput);
// Get Top 5 classes:
std::cerr << "final output size = " << finalOutput.size() << std::endl;
tensorflow::Tensor output = std::move(finalOutput.at(0));
auto scores = output.flat<float>();
std::cerr << "scores size=" << scores.size() << std::endl;
std::ifstream label(labelfile);
std::string line;
std::vector<std::pair<float, std::string>> sorted;
for (unsigned int i = 0; i <= 1000; ++i) {
std::getline(label, line);
sorted.emplace_back(scores(i), line);
}
std::sort(sorted.begin(), sorted.end());
std::reverse(sorted.begin(), sorted.end());
std::cout << "size of the sorted file is " << sorted.size() << std::endl;
for (unsigned int i = 0; i< 5; ++i)
std::cout << "The output of the current graph has category " << sorted[i].second << " with probability " << sorted[i].first << std::endl;
}
Do I miss anything? Any ideas?
Thanks in advance!
I had the same problem. When I changed to the model used in https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/benchmark (differente version of inception) bigger batch sizes work correctly.
Notice you need to change the input size from 299,299,3 to 224,224,3 and the input and output layer names to: input:0 and output:0
Probably the graph in the protobuf file had a fixed batch size of 1 and I was only changing the shape of the input, not the graph. The graph has to accept a variable batch size by setting the shape to (None, width, heihgt, channels). This is done when you freeze the graph. Since the graph we have is already frozen, there is no way to change the batch size at this point.
I have a bgr image uchar format from opencv c++.
the function is like int* texture(int* data, int width, int height); the function processes the image in c++ end and returns the pointer to the data. How do I convert this data in Unity to texture. basically make this data available to be put as a texture. I dont want to write it to a file. Please help.
Code snippet (I am using dlls) :::
public static WebCamTexture webCamTexture;
private Color32[] data;
private int[] imageData;
private int[] imdat;
void Start () {
....
data = new Color32[webCamTexture.width * webCamTexture.height];
imageData = new int[data.Length * 3];
}
void Update()
{
webCamTexture.GetPixels32(data);
// Convert the Color32[] in int* and emit it in bgr format
for (int i = 0; i < data.Length; ++i)
{
imageData[i * 3] = (int)data[i].b;
imageData[i * 3 + 1] = (int)data[i].g;
imageData[i * 3 + 2] = (int)data[i].r;
}
//this is the function called from dll
imdat = texture(imageData, int width, int height);
}
And the DLL end looks like ::
char *tmp;
int* texture(int* imageData ,int width ,int height)
{
int n = w * h * 3;
tmp = new char[n];
//ImageData inverted here and then passed onto tmp 3 channels image
for (int i = 0; i < (w*3); ++i)
for (int j = 0; j < h; ++j)
tmp[i + j * (w*3)] = (char)imageData[i + (h - j - 1) * (w*3)];
return (int)tmp;
}
I'm not sure what format texture you have is, but if you can convert it into byte[] you can use Texture2D.LoadImage(byte[]) to turn in into working texture.
You should be able to achieve what to want with BitConverter.GetBytes() and Texture2D.LoadImage(). Make sure you take special note of the image format restrictions in the Unity manual page there.
Not sure how your binding between your C++land and C#land code but you should be able to do something a little like this:
/* iImportedTexture = [Your c++ function here]; */
byte[] bImportedTexture = BitConverter.GetBytes(iImportedTexture);
Texture2D importedTexture = Texture2D.LoadImage(bImportedTexture);
as input data I have a 24 bit RGB image and a palette with 2..20 fixed colours. These colours are in no way spread regularly over the full colour range.
Now I have to modify the colours of input image so that only the colours of the given palette are used - using the colour out of the palette that is closest to the original colour (not closest mathematically but for human's visual impression). So what I need is an algorithm that uses an input colour and finds the colour in target palette that visually fits best to this colour. Please note: I'm not looking for a stupid comparison/difference algorithm but for something that really incorporates the impression a colour has on humans!
Since this is something that already should have been done and because I do not want to re-invent the wheel again: is there some example source code out there that does this job? In best case it is really a piece of code and not a link to a desastrous huge library ;-)
(I'd guess OpenCV does not provide such a function?)
Thanks
You should look at the Lab color space. It was designed so that the distance in the colour space equals the perceptual distance. So once you have converted your image you can compute the distances as you would have done earlier, but should get a better result from a perceptual point of view. In OpenCV you can use the cvtColor(source, destination, CV_BGR2Lab) function.
Another Idea would be to use dithering. The idea is to mix missing colours using neighbouring pixels. A popular algorithm for this is Floyd-Steinberg dithering.
Here is an example of mine, where I combined a optimized palette using k-means with the Lab colourspace and floyd steinberg dithering:
#include <opencv2/opencv.hpp>
#include <iostream>
using namespace cv;
using namespace std;
cv::Mat floydSteinberg(cv::Mat img, cv::Mat palette);
cv::Vec3b findClosestPaletteColor(cv::Vec3b color, cv::Mat palette);
int main(int argc, char** argv)
{
// Number of clusters (colors on result image)
int nrColors = 18;
cv::Mat imgBGR = imread(argv[1],1);
cv::Mat img;
cvtColor(imgBGR, img, CV_BGR2Lab);
cv::Mat colVec = img.reshape(1, img.rows*img.cols); // change to a Nx3 column vector
cv::Mat colVecD;
colVec.convertTo(colVecD, CV_32FC3, 1.0); // convert to floating point
cv::Mat labels, centers;
cv::kmeans(colVecD, nrColors, labels,
cv::TermCriteria(CV_TERMCRIT_ITER, 100, 0.1),
3, cv::KMEANS_PP_CENTERS, centers); // compute k mean centers
// replace pixels by there corresponding image centers
cv::Mat imgPosterized = img.clone();
for(int i = 0; i < img.rows; i++ )
for(int j = 0; j < img.cols; j++ )
for(int k = 0; k < 3; k++)
imgPosterized.at<Vec3b>(i,j)[k] = centers.at<float>(labels.at<int>(j+img.cols*i),k);
// convert palette back to uchar
cv::Mat palette;
centers.convertTo(palette,CV_8UC3,1.0);
// call floyd steinberg dithering algorithm
cv::Mat fs = floydSteinberg(img, palette);
cv::Mat imgPosterizedBGR, fsBGR;
cvtColor(imgPosterized, imgPosterizedBGR, CV_Lab2BGR);
cvtColor(fs, fsBGR, CV_Lab2BGR);
imshow("input",imgBGR); // original image
imshow("result",imgPosterizedBGR); // posterized image
imshow("fs",fsBGR); // floyd steinberg dithering
waitKey();
return 0;
}
cv::Mat floydSteinberg(cv::Mat imgOrig, cv::Mat palette)
{
cv::Mat img = imgOrig.clone();
cv::Mat resImg = img.clone();
for(int i = 0; i < img.rows; i++ )
for(int j = 0; j < img.cols; j++ )
{
cv::Vec3b newpixel = findClosestPaletteColor(img.at<Vec3b>(i,j), palette);
resImg.at<Vec3b>(i,j) = newpixel;
for(int k=0;k<3;k++)
{
int quant_error = (int)img.at<Vec3b>(i,j)[k] - newpixel[k];
if(i+1<img.rows)
img.at<Vec3b>(i+1,j)[k] = min(255,max(0,(int)img.at<Vec3b>(i+1,j)[k] + (7 * quant_error) / 16));
if(i-1 > 0 && j+1 < img.cols)
img.at<Vec3b>(i-1,j+1)[k] = min(255,max(0,(int)img.at<Vec3b>(i-1,j+1)[k] + (3 * quant_error) / 16));
if(j+1 < img.cols)
img.at<Vec3b>(i,j+1)[k] = min(255,max(0,(int)img.at<Vec3b>(i,j+1)[k] + (5 * quant_error) / 16));
if(i+1 < img.rows && j+1 < img.cols)
img.at<Vec3b>(i+1,j+1)[k] = min(255,max(0,(int)img.at<Vec3b>(i+1,j+1)[k] + (1 * quant_error) / 16));
}
}
return resImg;
}
float vec3bDist(cv::Vec3b a, cv::Vec3b b)
{
return sqrt( pow((float)a[0]-b[0],2) + pow((float)a[1]-b[1],2) + pow((float)a[2]-b[2],2) );
}
cv::Vec3b findClosestPaletteColor(cv::Vec3b color, cv::Mat palette)
{
int i=0;
int minI = 0;
cv::Vec3b diff = color - palette.at<Vec3b>(0);
float minDistance = vec3bDist(color, palette.at<Vec3b>(0));
for (int i=0;i<palette.rows;i++)
{
float distance = vec3bDist(color, palette.at<Vec3b>(i));
if (distance < minDistance)
{
minDistance = distance;
minI = i;
}
}
return palette.at<Vec3b>(minI);
}
Try this algorithm (it will reduct color number, but it compute palette by itself):
#include <opencv2/opencv.hpp>
#include "opencv2/legacy/legacy.hpp"
#include <vector>
#include <list>
#include <iostream>
using namespace cv;
using namespace std;
void main(void)
{
// Number of clusters (colors on result image)
int NrGMMComponents = 32;
// Source file name
string fname="D:\\ImagesForTest\\tools.jpg";
cv::Mat SampleImg = imread(fname,1);
//cv::GaussianBlur(SampleImg,SampleImg,Size(5,5),3);
int SampleImgHeight = SampleImg.rows;
int SampleImgWidth = SampleImg.cols;
// Pick datapoints
vector<Vec3d> ListSamplePoints;
for (int y=0; y<SampleImgHeight; y++)
{
for (int x=0; x<SampleImgWidth; x++)
{
// Get pixel color at that position
Vec3b bgrPixel = SampleImg.at<Vec3b>(y, x);
uchar b = bgrPixel.val[0];
uchar g = bgrPixel.val[1];
uchar r = bgrPixel.val[2];
if(rand()%25==0) // Pick not every, bu t every 25-th
{
ListSamplePoints.push_back(Vec3d(b,g,r));
}
} // for (x)
} // for (y)
// Form training matrix
Mat labels;
int NrSamples = ListSamplePoints.size();
Mat samples( NrSamples, 3, CV_32FC1 );
for (int s=0; s<NrSamples; s++)
{
Vec3d v = ListSamplePoints.at(s);
samples.at<float>(s,0) = (float) v[0];
samples.at<float>(s,1) = (float) v[1];
samples.at<float>(s,2) = (float) v[2];
}
cout << "Learning to represent the sample distributions with" << NrGMMComponents << "gaussians." << endl;
// Algorithm parameters
CvEMParams params;
params.covs = NULL;
params.means = NULL;
params.weights = NULL;
params.probs = NULL;
params.nclusters = NrGMMComponents;
params.cov_mat_type = CvEM::COV_MAT_GENERIC; // DIAGONAL, GENERIC, SPHERICAL
params.start_step = CvEM::START_AUTO_STEP;
params.term_crit.max_iter = 1500;
params.term_crit.epsilon = 0.001;
params.term_crit.type = CV_TERMCRIT_ITER|CV_TERMCRIT_EPS;
//params.term_crit.type = CV_TERMCRIT_ITER;
// Train
cout << "Started GMM training" << endl;
CvEM em_model;
em_model.train( samples, Mat(), params, &labels );
cout << "Finished GMM training" << endl;
// Result image
Mat img = Mat::zeros( Size( SampleImgWidth, SampleImgHeight ), CV_8UC3 );
// Ask classifier for each pixel
Mat sample( 1, 3, CV_32FC1 );
Mat means;
means=em_model.getMeans();
for(int i = 0; i < img.rows; i++ )
{
for(int j = 0; j < img.cols; j++ )
{
Vec3b v=SampleImg.at<Vec3b>(i,j);
sample.at<float>(0,0) = (float) v[0];
sample.at<float>(0,1) = (float) v[1];
sample.at<float>(0,2) = (float) v[2];
int response = cvRound(em_model.predict( sample ));
img.at<Vec3b>(i,j)[0]=means.at<double>(response,0);
img.at<Vec3b>(i,j)[1]=means.at<double>(response,1);
img.at<Vec3b>(i,j)[2]=means.at<double>(response,2);
}
}
img.convertTo(img,CV_8UC3);
imshow("result",img);
waitKey();
// Save the result
cv::imwrite("result.png", img);
}
PS: For perceptive color distance measurement it's better to use L*a*b color space. There is converter in opencv for this purpose. For clustering you can use k-means with defined cluster centers (your palette entries). After clustering you'll get points with indexes of palette intries.
Is there a way to easily extract the DCT coefficients (and quantization parameters) from encoded images and video? Any decoder software must be using them to decode block-DCT encoded images and video. So I'm pretty sure the decoder knows what they are. Is there a way to expose them to whomever is using the decoder?
I'm implementing some video quality assessment algorithms that work directly in the DCT domain. Currently, the majority of my code uses OpenCV, so it would be great if anyone knows of a solution using that framework. I don't mind using other libraries (perhaps libjpeg, but that seems to be for still images only), but my primary concern is to do as little format-specific work as possible (I don't want to reinvent the wheel and write my own decoders). I want to be able to open any video/image (H.264, MPEG, JPEG, etc) that OpenCV can open, and if it's block DCT-encoded, to get the DCT coefficients.
In the worst case, I know that I can write up my own block DCT code, run the decompressed frames/images through it and then I'd be back in the DCT domain. That's hardly an elegant solution, and I hope I can do better.
Presently, I use the fairly common OpenCV boilerplate to open images:
IplImage *image = cvLoadImage(filename);
// Run quality assessment metric
The code I'm using for video is equally trivial:
CvCapture *capture = cvCaptureFromAVI(filename);
while (cvGrabFrame(capture))
{
IplImage *frame = cvRetrieveFrame(capture);
// Run quality assessment metric on frame
}
cvReleaseCapture(&capture);
In both cases, I get a 3-channel IplImage in BGR format. Is there any way I can get the DCT coefficients as well?
Well, I did a bit of reading and my original question seems to be an instance of wishful thinking.
Basically, it's not possible to get the DCT coefficients from H.264 video frames for the simple reason that H.264 doesn't use DCT. It uses a different transform (integer transform). Next, the coefficients for that transform don't necessarily change on a frame-by-frame basis -- H.264 is smarter cause it splits up frames into slices. It should be possible to get those coefficients through a special decoder, but I doubt OpenCV exposes it for the user.
For JPEG, things are a bit more positive. As I suspected, libjpeg exposes the DCT coefficients for you. I wrote a small app to show that it works (source at the end). It makes a new image using the DC term from each block. Because the DC term is equal to the block average (after proper scaling), the DC images are downsampled versions of the input JPEG image.
EDIT: fixed scaling in source
Original image (512 x 512):
DC images (64x64): luma Cr Cb RGB
Source (C++):
#include <stdio.h>
#include <assert.h>
#include <cv.h>
#include <highgui.h>
extern "C"
{
#include "jpeglib.h"
#include <setjmp.h>
}
#define DEBUG 0
#define OUTPUT_IMAGES 1
/*
* Extract the DC terms from the specified component.
*/
IplImage *
extract_dc(j_decompress_ptr cinfo, jvirt_barray_ptr *coeffs, int ci)
{
jpeg_component_info *ci_ptr = &cinfo->comp_info[ci];
CvSize size = cvSize(ci_ptr->width_in_blocks, ci_ptr->height_in_blocks);
IplImage *dc = cvCreateImage(size, IPL_DEPTH_8U, 1);
assert(dc != NULL);
JQUANT_TBL *tbl = ci_ptr->quant_table;
UINT16 dc_quant = tbl->quantval[0];
#if DEBUG
printf("DCT method: %x\n", cinfo->dct_method);
printf
(
"component: %d (%d x %d blocks) sampling: (%d x %d)\n",
ci,
ci_ptr->width_in_blocks,
ci_ptr->height_in_blocks,
ci_ptr->h_samp_factor,
ci_ptr->v_samp_factor
);
printf("quantization table: %d\n", ci);
for (int i = 0; i < DCTSIZE2; ++i)
{
printf("% 4d ", (int)(tbl->quantval[i]));
if ((i + 1) % 8 == 0)
printf("\n");
}
printf("raw DC coefficients:\n");
#endif
JBLOCKARRAY buf =
(cinfo->mem->access_virt_barray)
(
(j_common_ptr)cinfo,
coeffs[ci],
0,
ci_ptr->v_samp_factor,
FALSE
);
for (int sf = 0; (JDIMENSION)sf < ci_ptr->height_in_blocks; ++sf)
{
for (JDIMENSION b = 0; b < ci_ptr->width_in_blocks; ++b)
{
int intensity = 0;
intensity = buf[sf][b][0]*dc_quant/DCTSIZE + 128;
intensity = MAX(0, intensity);
intensity = MIN(255, intensity);
cvSet2D(dc, sf, (int)b, cvScalar(intensity));
#if DEBUG
printf("% 2d ", buf[sf][b][0]);
#endif
}
#if DEBUG
printf("\n");
#endif
}
return dc;
}
IplImage *upscale_chroma(IplImage *quarter, CvSize full_size)
{
IplImage *full = cvCreateImage(full_size, IPL_DEPTH_8U, 1);
cvResize(quarter, full, CV_INTER_NN);
return full;
}
GLOBAL(int)
read_JPEG_file (char * filename, IplImage **dc)
{
/* This struct contains the JPEG decompression parameters and pointers to
* working space (which is allocated as needed by the JPEG library).
*/
struct jpeg_decompress_struct cinfo;
struct jpeg_error_mgr jerr;
/* More stuff */
FILE * infile; /* source file */
/* In this example we want to open the input file before doing anything else,
* so that the setjmp() error recovery below can assume the file is open.
* VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
* requires it in order to read binary files.
*/
if ((infile = fopen(filename, "rb")) == NULL) {
fprintf(stderr, "can't open %s\n", filename);
return 0;
}
/* Step 1: allocate and initialize JPEG decompression object */
cinfo.err = jpeg_std_error(&jerr);
/* Now we can initialize the JPEG decompression object. */
jpeg_create_decompress(&cinfo);
/* Step 2: specify data source (eg, a file) */
jpeg_stdio_src(&cinfo, infile);
/* Step 3: read file parameters with jpeg_read_header() */
(void) jpeg_read_header(&cinfo, TRUE);
/* We can ignore the return value from jpeg_read_header since
* (a) suspension is not possible with the stdio data source, and
* (b) we passed TRUE to reject a tables-only JPEG file as an error.
* See libjpeg.txt for more info.
*/
/* Step 4: set parameters for decompression */
/* In this example, we don't need to change any of the defaults set by
* jpeg_read_header(), so we do nothing here.
*/
jvirt_barray_ptr *coeffs = jpeg_read_coefficients(&cinfo);
IplImage *y = extract_dc(&cinfo, coeffs, 0);
IplImage *cb_q = extract_dc(&cinfo, coeffs, 1);
IplImage *cr_q = extract_dc(&cinfo, coeffs, 2);
IplImage *cb = upscale_chroma(cb_q, cvGetSize(y));
IplImage *cr = upscale_chroma(cr_q, cvGetSize(y));
cvReleaseImage(&cb_q);
cvReleaseImage(&cr_q);
#if OUTPUT_IMAGES
cvSaveImage("y.png", y);
cvSaveImage("cb.png", cb);
cvSaveImage("cr.png", cr);
#endif
*dc = cvCreateImage(cvGetSize(y), IPL_DEPTH_8U, 3);
assert(dc != NULL);
cvMerge(y, cr, cb, NULL, *dc);
cvReleaseImage(&y);
cvReleaseImage(&cb);
cvReleaseImage(&cr);
/* Step 7: Finish decompression */
(void) jpeg_finish_decompress(&cinfo);
/* We can ignore the return value since suspension is not possible
* with the stdio data source.
*/
/* Step 8: Release JPEG decompression object */
/* This is an important step since it will release a good deal of memory. */
jpeg_destroy_decompress(&cinfo);
fclose(infile);
return 1;
}
int
main(int argc, char **argv)
{
int ret = 0;
if (argc != 2)
{
fprintf(stderr, "usage: %s filename.jpg\n", argv[0]);
return 1;
}
IplImage *dc = NULL;
ret = read_JPEG_file(argv[1], &dc);
assert(dc != NULL);
IplImage *rgb = cvCreateImage(cvGetSize(dc), IPL_DEPTH_8U, 3);
cvCvtColor(dc, rgb, CV_YCrCb2RGB);
#if OUTPUT_IMAGES
cvSaveImage("rgb.png", rgb);
#else
cvNamedWindow("DC", CV_WINDOW_AUTOSIZE);
cvShowImage("DC", rgb);
cvWaitKey(0);
#endif
cvReleaseImage(&dc);
cvReleaseImage(&rgb);
return 0;
}
You can use, libjpeg to extract dct data of your jpeg file, but for h.264 video file, I can't find any open source code that give you dct data (actully Integer dct data). But you can use h.264 open source software like JM, JSVM or x264. In these two source file, you have to find their specific function that make use of dct function, and change it to your desire form, to get your output dct data.
For Image:
use the following code, and after read_jpeg_file( infilename, v, quant_tbl ), v and quant_tbl will have dct data and quantization table of your jpeg image respectively.
I used Qvector to store my output data, change it to your preferred c++ array list.
#include <iostream>
#include <stdio.h>
#include <jpeglib.h>
#include <stdlib.h>
#include <setjmp.h>
#include <fstream>
#include <QVector>
int read_jpeg_file( char *filename, QVector<QVector<int> > &dct_coeff, QVector<unsigned short> &quant_tbl)
{
struct jpeg_decompress_struct cinfo;
struct jpeg_error_mgr jerr;
FILE * infile;
if ((infile = fopen(filename, "rb")) == NULL) {
fprintf(stderr, "can't open %s\n", filename);
return 0;
}
cinfo.err = jpeg_std_error(&jerr);
jpeg_create_decompress(&cinfo);
jpeg_stdio_src(&cinfo, infile);
(void) jpeg_read_header(&cinfo, TRUE);
jvirt_barray_ptr *coeffs_array = jpeg_read_coefficients(&cinfo);
for (int ci = 0; ci < 1; ci++)
{
JBLOCKARRAY buffer_one;
JCOEFPTR blockptr_one;
jpeg_component_info* compptr_one;
compptr_one = cinfo.comp_info + ci;
for (int by = 0; by < compptr_one->height_in_blocks; by++)
{
buffer_one = (cinfo.mem->access_virt_barray)((j_common_ptr)&cinfo, coeffs_array[ci], by, (JDIMENSION)1, FALSE);
for (int bx = 0; bx < compptr_one->width_in_blocks; bx++)
{
blockptr_one = buffer_one[0][bx];
QVector<int> tmp;
for (int bi = 0; bi < 64; bi++)
{
tmp.append(blockptr_one[bi]);
}
dct_coeff.push_back(tmp);
}
}
}
// coantization table
j_decompress_ptr dec_cinfo = (j_decompress_ptr) &cinfo;
jpeg_component_info *ci_ptr = &dec_cinfo->comp_info[0];
JQUANT_TBL *tbl = ci_ptr->quant_table;
for(int ci =0 ; ci < 64; ci++){
quant_tbl.append(tbl->quantval[ci]);
}
return 1;
}
int main()
{
QVector<QVector<int> > v;
QVector<unsigned short> quant_tbl;
char *infilename = "your_image.jpg";
std::ofstream out;
out.open("out_dct.txt");
if( read_jpeg_file( infilename, v, quant_tbl ) > 0 ){
for(int j = 0; j < v.size(); j++ ){
for (int i = 0; i < v[0].size(); ++i){
out << v[j][i] << "\t";
}
out << "---------------" << std::endl;
}
out << "\n\n\n" << std::string(10,'-') << std::endl;
out << "\nQauntization Table:" << std::endl;
for(int i = 0; i < quant_tbl.size(); i++ ){
out << quant_tbl[i] << "\t";
}
}
else{
std::cout << "Can not read, Returned With Error";
return -1;
}
out.close();
return 0;
}