Which library should I use on OS X for arbitrary-precision arithmetic? (iOS)

I have already tried GMP and MPFR, but I can't get even a simple division like the one below to work. BTW, I'm using the LLVM compiler in Xcode, and I compile and run in the iOS Simulator.
mpf_t a;
mpf_init2(a, 256);
mpf_set_d(a, 0.7);

mpf_t b;
mpf_init2(b, 256);
mpf_set_d(b, 1.0);

mpf_t l;
mpf_init2(l, 256);

gmp_printf("%.*Ff \n", 5, a);  // prints 0.70000
gmp_printf("%.*Ff \n", 5, b);  // prints 1.00000

mpf_div(l, a, b);
gmp_printf("%.*Ff", 5, l);     // prints 0.52502 (expected 0.70000)

Have you tried MPIR? OpenSSL also provides a big number library...
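For what it's worth, the GMP calls in the question look like valid usage (mpf_init2, mpf_set_d, mpf_div, and gmp_printf with %.*Ff are all used correctly), so a garbage quotient usually points at a build or ABI mismatch between the library and the simulator rather than at the code. Here is a minimal, self-contained sketch of the same computation, with the cleanup calls added, that can be built on the host (e.g. cc test.c -lgmp) to confirm the library itself:

#include <stdio.h>
#include <gmp.h>

int main(void) {
    mpf_t a, b, l;
    mpf_init2(a, 256);            // 256-bit mantissa
    mpf_init2(b, 256);
    mpf_init2(l, 256);

    mpf_set_d(a, 0.7);
    mpf_set_d(b, 1.0);

    mpf_div(l, a, b);             // l = a / b
    gmp_printf("%.*Ff\n", 5, l);  // expected: 0.70000

    mpf_clear(a);                 // release the limbs
    mpf_clear(b);
    mpf_clear(l);
    return 0;
}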

Related

SIMD: Bit-pack signed integers

Unsigned integers can be compressed by using "bit-packing" techniques: Within a block of unsigned integers only the significant bits are stored, resulting in data compression when all integers in a block are "small". The method is known as FOR (frame of reference).
There are SIMD libraries that do this very efficiently.
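As a scalar sketch of the packing step (the SIMD libraries do the same thing in registers, many lanes at a time), assuming each value already fits in `bits` bits and the output buffer is zero-initialized; pack_bits is a hypothetical helper:

#include <stdint.h>
#include <stddef.h>

// Sketch: append the low `bits` bits of v to a bit stream at *bitpos.
static void pack_bits(uint64_t *buf, size_t *bitpos, uint32_t v, unsigned bits) {
    size_t word = *bitpos / 64, off = *bitpos % 64;
    buf[word] |= (uint64_t)v << off;
    if (off + bits > 64)                        // value straddles a word boundary
        buf[word + 1] = (uint64_t)v >> (64 - off);
    *bitpos += bits;
}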
Now I want to use FOR-like techniques to encode signed integers, e.g. from a differenced sequence of unsorted unsigned integers. The sign of each signed integer needs to be stored somewhere, there are two options:
Store the signs in a separate block of data. This adds overhead.
Store the sign together with the absolute value of each signed integer.
I'm following path 2 right now. Two's complement has the sign bit in the MSB (most significant bit), so that won't work for bit-packing à la FOR. One possibility is to store the sign in the LSB (least significant bit) instead. Storing signed integers this way is very unusual; as far as I know, no instructions support it directly. The question now is: can these LSB-signed integers be encoded/decoded efficiently using SIMD instructions?
I think AVX-512 _mm_testn_epi32_mask can be used to extract the LSB from each uint32, followed by a shift, then two mask_extract of some sort? Quite convoluted.
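For reference, the scalar form of this LSB-sign mapping is ZigZag coding, as used by Protocol Buffers; a minimal sketch (assuming arithmetic right shift on signed values, which mainstream compilers provide):

#include <stdint.h>

// Scalar ZigZag: maps 0,-1,1,-2,2,... to 0,1,2,3,4,...
// so the sign lands in the least significant bit.
static inline uint32_t zigzag_encode32(int32_t v) {
    return ((uint32_t)v << 1) ^ (uint32_t)(v >> 31);   // arithmetic shift assumed
}
static inline int32_t zigzag_decode32(uint32_t v) {
    return (int32_t)((v >> 1) ^ (0u - (v & 1)));       // (v >> 1) ^ -(v & 1)
}

The SIMD routines below vectorize exactly this transform.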
Untested ZigZag examples in C using SSE2 for 64-bit integers:
(note: SSE2 is missing some 64-bit instructions...)
#include <emmintrin.h>

// from a comment by Peter Cordes
__m128i zigzag_encode_epi64(__m128i v) {
    __m128i signmask = _mm_shuffle_epi32(v, _MM_SHUFFLE(3,3,1,1));
    signmask = _mm_srai_epi32(signmask, 31);
    return _mm_xor_si128(_mm_add_epi64(v, v), signmask);
}

__m128i zigzag_decode_epi64 (__m128i v) {
    __m128i signmask = _mm_and_si128(_mm_set_epi32(0, 1, 0, 1), v);
    signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask);
    return _mm_xor_si128(_mm_srli_epi64(v, 1), signmask);
}

// variant without a constant
__m128i zigzag_decodev3_epi64 (__m128i v) {
    __m128i t = _mm_srli_epi64(v, 1);
    __m128i signmask = _mm_sub_epi64(_mm_slli_epi64(t, 1), v);
    return _mm_xor_si128(t, signmask);
}
Zigzag is good for bitwise varints. However, a bytewise group-varint may wish to "sign extend from a variable bit-width".
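A minimal scalar sketch of that variable-width sign extension, using the classic (x ^ m) - m trick; sign_extend is a hypothetical helper:

#include <stdint.h>

// Sign-extend the low b bits of x (1 <= b <= 32) to a full int32_t.
static inline int32_t sign_extend(uint32_t x, unsigned b) {
    uint32_t m = 1u << (b - 1);      // position of the sign bit
    return (int32_t)((x ^ m) - m);   // flips the sign bit, then subtracts it back
}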
32-bit examples
I favored compares over arithmetic shifts. I assume that, when unrolled, the compares will have one cycle lower latency.
__m128i zigzag_encode_epi32 (__m128i v) {
    __m128i signmask = _mm_cmpgt_epi32(_mm_setzero_si128(), v);
    return _mm_xor_si128(_mm_add_epi32(v, v), signmask);
}

__m128i zigzag_decode_epi32 (__m128i v) {
    const __m128i m = _mm_set1_epi32(1);
    __m128i signmask = _mm_cmpeq_epi32(_mm_and_si128(m, v), m);
    return _mm_xor_si128(_mm_srli_epi32(v, 1), signmask);
}

// note: _mm_alignr_epi8 requires SSSE3
__m128i delta_encode_epi32 (__m128i v, __m128i prev) {
    return _mm_sub_epi32(v, _mm_alignr_epi8(v, prev, 12));
}

// prefix sum (see many answers around Stack Overflow...)
__m128i delta_decode_epi32 (__m128i v, __m128i prev) {
    prev = _mm_shuffle_epi32(prev, _MM_SHUFFLE(3,3,3,3)); // [P  P   P    P    ]
    v = _mm_add_epi32(v, _mm_slli_si128(v, 4));           // [A  AB  BC   CD   ]
    prev = _mm_add_epi32(prev, v);                        // [PA PAB PBC  PCD  ]
    v = _mm_slli_si128(v, 8);                             // [0  0   A    AB   ]
    return _mm_add_epi32(prev, v);                        // [PA PAB PABC PABCD]
}

__m128i delta_zigzag_encode_epi32 (__m128i v, __m128i prev) {
    return zigzag_encode_epi32(delta_encode_epi32(v, prev));
}

__m128i delta_zigzag_decode_epi32 (__m128i v, __m128i prev) {
    return delta_decode_epi32(zigzag_decode_epi32(v), prev);
}
Note: for delta coding it would be faster (for the round trip, i.e. decoding) to transpose the elements while encoding and transpose them back during decoding, because horizontal prefix sums are really slow (a transpose sketch follows below). However, determining the optimum number of elements to transpose in each batch seems like a hard problem.
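To make the transpose idea concrete, a sketch of a 4x4 32-bit transpose built from SSE2 unpack operations; in transposed (structure-of-arrays) layout, delta decoding becomes independent vertical adds with no horizontal shifts:

#include <emmintrin.h>

// Sketch: transpose a 4x4 block of int32, one row per __m128i.
static inline void transpose4x4_epi32(__m128i r[4]) {
    __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); // a0 b0 a1 b1
    __m128i t1 = _mm_unpacklo_epi32(r[2], r[3]); // c0 d0 c1 d1
    __m128i t2 = _mm_unpackhi_epi32(r[0], r[1]); // a2 b2 a3 b3
    __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); // c2 d2 c3 d3
    r[0] = _mm_unpacklo_epi64(t0, t1);           // a0 b0 c0 d0
    r[1] = _mm_unpackhi_epi64(t0, t1);           // a1 b1 c1 d1
    r[2] = _mm_unpacklo_epi64(t2, t3);           // a2 b2 c2 d2
    r[3] = _mm_unpackhi_epi64(t2, t3);           // a3 b3 c3 d3
}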

Quickly dumping large tables passed from Lua to C

In order to quickly save Lua tables containing large 1-dimensional arrays (the number of arrays is known, but the number of elements isn't fixed; approximately 800,000 elements in each array), I planned to use the Lua C API in the following way:
#include "lua.h"
#include "lauxlib.h"
#include <stdio.h>
#include <assert.h>
static int save_table(lua_State *L) {
assert(L && lua_type(L, -1) == LUA_TTABLE);
int len, r;
void *ptr;
FILE *f;
lua_pushstring(L, "p");
lua_gettable(L, -2);
len = lua_objlen(L, -1);
ptr = lua_topointer(L, -1);
f = fopen("p.bin", "wb");
assert(f);
r = fwrite(ptr, sizeof(int), len, f);
printf("[p] wrote %d elements out of %d requested\n", r, len);
fclose(f);
lua_pop(L, 1);
lua_pushstring(L, "q");
lua_gettable(L, -2);
len = lua_objlen(L, -1);
ptr = lua_topointer(L, -1);
f = fopen("q.bin", "wb");
assert(f);
r = fwrite(ptr, sizeof(float), len, f);
printf("[q] wrote %d elements out of %d requested\n", r, len);
fclose(f);
lua_pop(L, 1);
return 1;
}
int luaopen_savetable(lua_State *L) {
static const luaL_reg Map[] = {{"save_table", save_table}, {NULL, NULL}};
luaL_register(L, "mytask", Map);
return 1;
}
The Lua code is shown below:
-- sample table containing two 1-D arrays
my_table = {p = {11, 22, 33, 44}, q = {0.12, 0.23, 0.34, 0.45, 0.56}}
require "savetable"
mytask.save_table(my_table)
The above code produces two binary files with the wrong content. What is wrong here?
PS: I am using Lua 5.1. I am not sure if this is the fastest way of dumping large Lua tables. Suggestions are always welcome.
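For context on why the files contain garbage: lua_topointer returns an opaque pointer to the interpreter's internal table object and is documented only for debugging (e.g. printing unique IDs); a Lua 5.1 table does not store its array part as a packed int or float buffer, so fwrite from that pointer dumps interpreter internals rather than the numbers. A minimal sketch of a portable alternative that fetches each element through the stack (dump_array is a hypothetical helper; same headers as the module above):

// Sketch: write the array part of the table at stack index `idx`
// as native doubles, one lua_rawgeti per element.
static void dump_array(lua_State *L, int idx, const char *fname) {
    size_t n = lua_objlen(L, idx);        // Lua 5.1 array length
    FILE *f = fopen(fname, "wb");
    assert(f);
    for (size_t i = 1; i <= n; i++) {
        lua_rawgeti(L, idx, (int)i);      // push t[i]
        double d = lua_tonumber(L, -1);   // Lua 5.1 numbers are doubles
        lua_pop(L, 1);
        fwrite(&d, sizeof d, 1, f);       // convert to int/float here if desired
    }
    fclose(f);
}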

Interface OpenCV's Mat containers with BLAS for matrix multiplication

I am processing UHD (2160 x 3840) images.
One of the processing steps consists of applying a Sobel filter along the X and Y axes; I then have to multiply each output matrix by its transpose, and compute the gradient image as the square root of the sum of the two products.
So: S = sqrt(S_x * S_x^t + S_y * S_y^t).
Due to the dimensions of the image, OpenCV takes up to twenty seconds to process this without multithreading, and ten seconds with multithreading.
I know OpenCV calls OpenCL in order to speed up the filtering operations, so I think it would take a long time to squeeze more performance out of the filtering step.
For the matrix multiplication, I experience some instability with OpenCV's OpenCL GEMM kernel implementation.
So I would like to try OpenBLAS instead.
My questions are:
1.)
I wrote the following code, but I face some issues interfacing OpenCV's Mat objects:
template<class _Ty>
void mm(cv::Mat& A, cv::Mat& B, cv::Mat& C)
{
    // note: static_assert(true, ...) never fires; a dependent-false
    // expression would be needed to actually reject non-float types.
    static_assert(true, "matrix_multiply is only defined for floating precision numbers.");
}

template<>
inline void mm<float>(cv::Mat& A, cv::Mat& B, cv::Mat& C)
{
    const int M = A.rows;
    const int N = B.cols;
    const int K = A.cols;
    cblas_sgemm(CblasRowMajor,  // 1
                CblasNoTrans,   // 2  TRANSA
                CblasNoTrans,   // 3  TRANSB
                M,              // 4  M
                N,              // 5  N
                K,              // 6  K
                1.,             // 7  ALPHA
                A.ptr<float>(), // 8  A
                A.rows,         // 9  LDA
                B.ptr<float>(), // 10 B
                B.rows,         // 11 LDB
                0.,             // 12 BETA
                C.ptr<float>(), // 13 C
                C.rows);        // 14 LDC
}

template<>
inline void mm<double>(cv::Mat& A, cv::Mat& B, cv::Mat& C)
{
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                A.rows, B.cols, A.cols,
                1., A.ptr<double>(), A.rows,
                B.ptr<double>(), B.cols,
                0., C.ptr<double>(), C.rows);
}

void matrix_multiply(cv::InputArray _src1, cv::InputArray _src2, cv::OutputArray _dst)
{
    CV_DbgAssert((_src1.isMat() || _src1.isUMat()) && (_src1.kind() == _src2.kind()) &&
                 (_src1.depth() == _src2.depth()) && (_src1.depth() == CV_32F) &&
                 (_src1.depth() == _src1.type()) &&
                 (_src1.rows() == _src2.cols()));

    cv::Mat src1 = _src1.getMat();
    cv::Mat src2 = _src2.getMat();
    cv::Mat dst;
    bool cpy(false);

    if (_dst.rows() == _src1.rows() && _dst.cols() == _src2.cols() && _dst.type() == _src1.type())
        dst = _dst.getMat();
    else
    {
        dst = cv::Mat::zeros(src1.rows, src2.cols, src1.type());
        cpy = true;
    }

    // (as posted, nothing here invokes mm<>; presumably mm<float>(src1, src2, dst)
    // is called at this point)

    if (cpy)
        dst.copyTo(_dst);
}
I tried to organize the data as specified here:
http://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3.html#gafe51bacb54592ff5de056acabd83c260
without success.
This is my main issue.
2.)
To try to speed up my implementation a little, I was thinking of applying the divide-and-conquer approach illustrated here:
https://en.wikipedia.org/wiki/Matrix_multiplication_algorithm
but with only four submatrices.
Has anyone tried a similar approach, or got a better way to gain performance in matrix multiplication (without using a GPU)?
Thank you in advance for any help.
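To make the four-submatrix idea concrete, here is a sketch of one level of 2x2 blocking built on cv::gemm (mm_block4 is a hypothetical helper; it assumes even dimensions, and since tuned GEMM implementations already block internally, a speedup is not guaranteed):

#include <opencv2/core.hpp>

// Sketch: C = A * B via one level of 2x2 blocking.
// Cij = Ai0 * B0j + Ai1 * B1j, each product computed with cv::gemm.
void mm_block4(const cv::Mat& A, const cv::Mat& B, cv::Mat& C)
{
    const int m = A.rows / 2, k = A.cols / 2, n = B.cols / 2;
    C.create(A.rows, B.cols, A.type());
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j) {
            cv::Mat t1, t2;
            cv::gemm(A(cv::Rect(0, i * m, k, m)), B(cv::Rect(j * n, 0, n, k)),
                     1.0, cv::noArray(), 0.0, t1);          // Ai0 * B0j
            cv::gemm(A(cv::Rect(k, i * m, k, m)), B(cv::Rect(j * n, k, n, k)),
                     1.0, cv::noArray(), 0.0, t2);          // Ai1 * B1j
            cv::add(t1, t2, C(cv::Rect(j * n, i * m, n, m))); // Cij = t1 + t2
        }
}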
I found a solution to question 1).
I based my first implementation on the documentation of the BLAS library. BLAS was written in Fortran; in that language indices start at 1, not at 0 as in C or C++. Another thing: many libraries written in Fortran (e.g. BLAS, LAPACK) organize their memory in column-major order, whereas most C or C++ libraries (e.g. OpenCV) organize memory in row-major order.
After taking these two properties into account, I modified my code to:
template<class _Ty>
void mm(cv::Mat& A, cv::Mat& B, cv::Mat& C)
{
    // note: static_assert(true, ...) never fires; a dependent-false
    // expression would be needed to actually reject non-float types.
    static_assert(true, "The function gemm is only defined for floating precision numbers.");
}

template<>
void mm<float>(cv::Mat& A, cv::Mat& B, cv::Mat& C)
{
    const int M = A.cols + 1;
    const int N = B.rows;
    const int K = A.cols;
    cblas_sgemm(CblasRowMajor,  // 1
                CblasNoTrans,   // 2  TRANSA
                CblasNoTrans,   // 3  TRANSB
                M,              // 4  M
                N,              // 5  N
                K,              // 6  K
                1.,             // 7  ALPHA
                A.ptr<float>(), // 8  A
                A.step1(),      // 9  LDA
                B.ptr<float>(), // 10 B
                B.step1(),      // 11 LDB
                0.,             // 12 BETA
                C.ptr<float>(), // 13 C
                C.step1());     // 14 LDC
}

template<>
void mm<double>(cv::Mat& A, cv::Mat& B, cv::Mat& C)
{
    const int M = A.cols + 1;
    const int N = B.rows;
    const int K = A.cols;
    cblas_dgemm(CblasRowMajor,   // 1
                CblasNoTrans,    // 2  TRANSA
                CblasNoTrans,    // 3  TRANSB
                M,               // 4  M
                N,               // 5  N
                K,               // 6  K
                1.,              // 7  ALPHA
                A.ptr<double>(), // 8  A
                A.step1(),       // 9  LDA
                B.ptr<double>(), // 10 B
                B.step1(),       // 11 LDB
                0.,              // 12 BETA
                C.ptr<double>(), // 13 C
                C.step1());      // 14 LDC
}
And everything works well.
Without additional multithreading or a divide-and-conquer approach, I was able to reduce the processing time of one step of my code from 150 ms to 500 µs.
So it fixes everything for me :).
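For comparison, a minimal sketch of the textbook row-major mapping for C = A * B on CV_32F Mats; note it derives M from A.rows, unlike the code above, so treat it as an assumption to verify against your data (gemm_rowmajor is a hypothetical helper):

#include <opencv2/core.hpp>
#include <cblas.h>

// Sketch: C (MxN) = A (MxK) * B (KxN), all CV_32F, row-major layout.
void gemm_rowmajor(const cv::Mat& A, const cv::Mat& B, cv::Mat& C)
{
    const int M = A.rows;
    const int N = B.cols;
    const int K = A.cols;                        // must equal B.rows
    C.create(M, N, CV_32F);
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                M, N, K,
                1.f,
                A.ptr<float>(), (int)A.step1(),  // LDA >= K for row-major
                B.ptr<float>(), (int)B.step1(),  // LDB >= N
                0.f,
                C.ptr<float>(), (int)C.step1()); // LDC >= N
}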

check; get_model; check causes segfault in Z3 C API

I'm trying to use Z3 via the C API and smtlib2 for incremental solving. Unfortunately, I got a segmentation violation when asserting some simple formula, checking it, obtaining its model, asserting something additional and then checking again. This also happens without asserting something new, i.e. when checking, retrieving a model, and checking again. Here is a minimal example to reproduce the error:
#include <z3.h>

int main()
{
    Z3_config cfg = Z3_mk_config();
    Z3_context ctx = Z3_mk_context(cfg);
    Z3_ast fs = Z3_parse_smtlib2_string(ctx,
        "(declare-fun a () Int) (assert (= a 0))", 0, 0, 0, 0, 0, 0);
    Z3_solver solver = Z3_mk_solver(ctx);
    Z3_solver_assert(ctx, solver, fs);
    Z3_solver_check(ctx, solver);
    Z3_model m = Z3_solver_get_model(ctx, solver);
    Z3_solver_check(ctx, solver);
    Z3_del_config(cfg);
    return 0;
}
I tried with two Z3 versions (4.3.1 on a Mac 64 bit and 4.1 on Ubuntu 64 bit).
I appreciate any help, hints or workarounds - maybe I'm just using the API in a wrong way?
Many thanks,
Elisabeth
Here is a version of your code using reference counts.
It crashes again if I remove the reference-counting calls.
int main() {
    Z3_config cfg = Z3_mk_config();
    Z3_context ctx = Z3_mk_context(cfg);
    Z3_ast fs = Z3_parse_smtlib2_string(ctx,
        "(declare-fun a () Int) (assert (= a 0))", 0, 0, 0, 0, 0, 0);
    Z3_inc_ref(ctx, fs);
    Z3_solver solver = Z3_mk_solver(ctx);
    Z3_solver_inc_ref(ctx, solver);
    Z3_solver_assert(ctx, solver, fs);
    Z3_solver_check(ctx, solver);
    Z3_model m = Z3_solver_get_model(ctx, solver);
    Z3_model_inc_ref(ctx, m);
    Z3_solver_check(ctx, solver);
    // work with the model
    Z3_solver_dec_ref(ctx, solver);
    Z3_model_dec_ref(ctx, m);
    Z3_dec_ref(ctx, fs);
    Z3_del_config(cfg);
    return 0;
}
BTW, the C++ API hides all the reference-counting details; it is much more convenient to work with.
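For illustration, a minimal sketch of the same check / get-model / check flow through the C++ API, building the assertion directly rather than parsing SMT-LIB (the parsing entry points vary across Z3 versions):

#include <z3++.h>

int main() {
    z3::context c;
    z3::solver s(c);
    z3::expr a = c.int_const("a");
    s.add(a == 0);
    s.check();
    z3::model m = s.get_model();  // smart pointers manage the ref counts
    s.check();                    // checking again is now safe
    return 0;
}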

Z3 4.0 Push and Pop In Solver

I want to verify my problem using the solver with 2 different constraints. I wrote a sample program where I have a variable x, and I want to check and get a model for x = 0 and x = 1.
I am trying to use push and pop in the solver, but I am not sure how to do it exactly. I have written the following code. When I try to push the context and pop it back, I get a crash; I do not understand the reason for the crash, but it's a segfault. Even if I comment out the push and pop instructions as below, I still get the crash.
Could someone please give me some pointers to solve the problem?
Z3_config cfg;
Z3_context ctx;
Z3_solver solver;
Z3_ast x, zero, one, x_eq_zero, x_eq_one;

cfg = Z3_mk_config();
ctx = Z3_mk_context(cfg);
Z3_del_config(cfg);
solver = Z3_mk_solver((Z3_context)ctx);

x = mk_int_var(ctx, "x");
zero = mk_int(ctx, 0);
one = mk_int(ctx, 1);
x_eq_zero = Z3_mk_eq(ctx, x, zero);
x_eq_one = Z3_mk_eq(ctx, x, one);

//Z3_solver_push (ctx, solver);
Z3_solver_assert(ctx, solver, x_eq_zero);
printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));
printf("%s \n", Z3_ast_to_string(ctx, x_eq_zero));

int result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx,
       Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));

// Z3_solver_pop (ctx, solver, 1);
// printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));

Z3_solver_assert(ctx, solver, x_eq_one);
result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx,
       Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));
return 0;
The new API in Z3 4.0 has many new features. For example, it introduces several new objects: Solvers, Goals, Tactics, Probes, etc. Moreover, we also introduce a new memory management policy for objects such as ASTs and Models that existed in previous APIs. The new memory management policy is based on reference counting: every object has APIs of the form Z3_<object>_inc_ref and Z3_<object>_dec_ref.
We still support the old memory management policy for ASTs and Models. If the Z3_context is created using Z3_mk_context, then the old memory management policy is enabled for ASTs. If it is created using Z3_mk_context_rc, then Z3_inc_ref and Z3_dec_ref must be used to manage the reference counters. However, the new objects (Solvers, Goals, Tactics, etc.) only support reference counting. We strongly encourage all users to move to the new reference-counting memory management policy, so all new objects only support this policy. Moreover, all managed APIs (.NET, Python and OCaml) are based on the reference counting policy. Note that we provide a thin C++ layer on top of the C API; it "hides" all reference counting calls using "smart pointers". The source code for the C++ layer is included in the Z3 distribution.
That being said, your program crashes because you did not increment the reference counter of the Z3_solver object. Here is the corrected version of your program; I essentially added the missing calls to Z3_solver_inc_ref and Z3_solver_dec_ref (the latter is needed to avoid a memory leak). Below it, I also include the same program using the C++ API, which is much simpler. The C++ API is provided in the file include\z3++.h in the Z3 distribution; examples are included at examples\c++.
Z3_config cfg;
Z3_context ctx;
Z3_solver solver;
Z3_ast x, zero, one, x_eq_zero, x_eq_one;

cfg = Z3_mk_config();
ctx = Z3_mk_context(cfg);
Z3_del_config(cfg);
solver = Z3_mk_solver((Z3_context)ctx);
Z3_solver_inc_ref(ctx, solver);

x = mk_int_var(ctx, "x");
zero = mk_int(ctx, 0);
one = mk_int(ctx, 1);
x_eq_zero = Z3_mk_eq(ctx, x, zero);
x_eq_one = Z3_mk_eq(ctx, x, one);

//Z3_solver_push (ctx, solver);
Z3_solver_assert(ctx, solver, x_eq_zero);
printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));
printf("%s \n", Z3_ast_to_string(ctx, x_eq_zero));

int result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx,
       Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));

// Z3_solver_pop (ctx, solver, 1);
// printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));

Z3_solver_assert(ctx, solver, x_eq_one);
result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
// printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx,
//        Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));
Z3_solver_dec_ref(ctx, solver);
return 0;
C++ version
context c;
solver s(c);
expr x = c.int_const("x");
expr x_eq_zero = x == 0;
expr x_eq_one = x == 1;

s.add(x_eq_zero);
std::cout << "Scopes : " << Z3_solver_get_num_scopes(c, s) << "\n";
std::cout << x_eq_zero << "\n";
std::cout << s.check() << "\n";
std::cout << s.get_model() << "\n";

s.add(x_eq_one);
std::cout << s.check() << "\n";
return 0;
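Since the original question was about push and pop, here is a minimal sketch of the scoped variant in the C++ API, so that x == 0 is retracted before x == 1 is asserted (solver::push and solver::pop are part of z3++.h):

#include <z3++.h>
#include <iostream>

int main() {
    z3::context c;
    z3::solver s(c);
    z3::expr x = c.int_const("x");

    s.push();                            // open scope 1
    s.add(x == 0);
    std::cout << s.check() << "\n";      // sat, model has x = 0
    std::cout << s.get_model() << "\n";
    s.pop();                             // retract x == 0

    s.add(x == 1);
    std::cout << s.check() << "\n";      // sat, model has x = 1
    std::cout << s.get_model() << "\n";
    return 0;
}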
