I'm trying to use Z3 via the C API and smtlib2 for incremental solving. Unfortunately, I got a segmentation violation when asserting some simple formula, checking it, obtaining its model, asserting something additional and then checking again. This also happens without asserting something new, i.e. when checking, retrieving a model, and checking again. Here is a minimal example to reproduce the error:
#include <z3.h>

int main()
{
    Z3_config cfg = Z3_mk_config();
    Z3_context ctx = Z3_mk_context(cfg);
    Z3_ast fs = Z3_parse_smtlib2_string(ctx, "(declare-fun a () Int) (assert (= a 0))", 0, 0, 0, 0, 0, 0);
    Z3_solver solver = Z3_mk_solver(ctx);
    Z3_solver_assert(ctx, solver, fs);
    Z3_solver_check(ctx, solver);
    Z3_model m = Z3_solver_get_model(ctx, solver);
    Z3_solver_check(ctx, solver);
    Z3_del_config(cfg);
    return 0;
}
I tried two Z3 versions (4.3.1 on 64-bit Mac and 4.1 on 64-bit Ubuntu).
I appreciate any help, hints, or workarounds - maybe I'm just using the API in the wrong way?
Many thanks,
Elisabeth
Here is a version of your code using reference counts.
It crashes if I remove the reference-counting calls.
#include <z3.h>

int main()
{
    Z3_config cfg = Z3_mk_config();
    Z3_context ctx = Z3_mk_context(cfg);
    Z3_ast fs = Z3_parse_smtlib2_string(ctx, "(declare-fun a () Int) (assert (= a 0))", 0, 0, 0, 0, 0, 0);
    Z3_inc_ref(ctx, fs);
    Z3_solver solver = Z3_mk_solver(ctx);
    Z3_solver_inc_ref(ctx, solver);
    Z3_solver_assert(ctx, solver, fs);
    Z3_solver_check(ctx, solver);
    Z3_model m = Z3_solver_get_model(ctx, solver);
    Z3_model_inc_ref(ctx, m);
    Z3_solver_check(ctx, solver);
    // work with model
    Z3_solver_dec_ref(ctx, solver);
    Z3_model_dec_ref(ctx, m);
    Z3_dec_ref(ctx, fs);
    Z3_del_config(cfg);
    return 0;
}
BTW. The C++ API hides all the reference counting details. It is much more convenient to work with.
Unsigned integers can be compressed by using "bit-packing" techniques: Within a block of unsigned integers only the significant bits are stored, resulting in data compression when all integers in a block are "small". The method is known as FOR (frame of reference).
There are SIMD libraries that do this very efficiently.
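To make the idea concrete, here is a minimal scalar sketch of the packing step (just an illustration, not taken from any particular library): every value in a block is assumed to fit in bits bits once the block's reference value base (e.g. the block minimum) is subtracted, and the remainders are written densely into a little-endian bit stream. Decoding mirrors this and adds base back.
#include <stdint.h>
#include <stddef.h>

// Pack n unsigned values, each known to fit in 'bits' bits after the block's
// reference value 'base' is subtracted, into a dense little-endian bit stream.
void for_pack(const uint32_t *in, size_t n, uint32_t base, unsigned bits, uint8_t *out)
{
    uint64_t acc = 0;    // bit accumulator
    unsigned filled = 0; // number of valid bits currently in acc
    for (size_t i = 0; i < n; i++) {
        acc |= (uint64_t)(in[i] - base) << filled;
        filled += bits;
        while (filled >= 8) { // flush whole bytes
            *out++ = (uint8_t)acc;
            acc >>= 8;
            filled -= 8;
        }
    }
    if (filled) *out = (uint8_t)acc; // trailing partial byte
}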
Now I want to use FOR-like techniques to encode signed integers, e.g. from a differenced sequence of unsorted unsigned integers. The sign of each signed integer needs to be stored somewhere; there are two options:
Store the signs in a separate block of data. This adds overhead.
Store the sign together with the absolute value of each signed integer.
I'm following path 2 right now. Two's complement has the sign bit in the msb (most significant bit), so that won't work for bit-packing à la FOR. One possibility is to store the sign in the lsb (least significant bit). Storing signed integers this way is very unusual, and there are no instructions that support it, as far as I know. The question now is: can these lsb-signed integers be encoded/decoded efficiently using SIMD instructions?
I think AVX-512 _mm_testn_epi32_mask can be used to extract the lsb from each uint32, followed by a shift, then two mask_extract of some sort? Quite convoluted.
Untested ZigZag examples in C using SSE2 for 64-bit integers:
(note: SSE2 is missing some 64-bit instructions...)
#include <emmintrin.h>
// from comment by Peter-Cordes
__m128i zigzag_encode_epi64(__m128i v) {
    __m128i signmask = _mm_shuffle_epi32(v, _MM_SHUFFLE(3,3,1,1));
    signmask = _mm_srai_epi32(signmask, 31);
    return _mm_xor_si128(_mm_add_epi64(v, v), signmask);
}

__m128i zigzag_decode_epi64(__m128i v) {
    __m128i signmask = _mm_and_si128(_mm_set_epi32(0, 1, 0, 1), v);
    signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask);
    return _mm_xor_si128(_mm_srli_epi64(v, 1), signmask);
}

// no constant
__m128i zigzag_decodev3_epi64(__m128i v) {
    __m128i t = _mm_srli_epi64(v, 1);
    __m128i signmask = _mm_sub_epi64(_mm_slli_epi64(t, 1), v);
    return _mm_xor_si128(t, signmask);
}
Zigzag is good for bitwise varints. However, a bytewise group-varint may wish to "sign extend from a variable bit-width".
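That "sign extend from a variable bit-width" step has a well-known scalar form, shown here only as an illustration (the helper name is made up): mask to the field width, then xor with the field's sign bit and subtract it back.
#include <stdint.h>

// Sign-extend the low 'bits' bits of x (1 <= bits <= 32) to a full int32_t.
// Classic xor-and-subtract trick; assumes the usual two's-complement conversion.
int32_t sign_extend_var(uint32_t x, unsigned bits) {
    uint32_t m = 1u << (bits - 1);  // sign bit of the 'bits'-wide field
    x &= (m << 1) - 1;              // keep only the field (wraps to an all-ones mask for bits == 32)
    return (int32_t)((x ^ m) - m);
}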
32-bit examples
I favored compares over arithmetic shifts. I assume - when unrolled - compares will have 1 cycle lower latency.
#include <tmmintrin.h> // SSSE3, for _mm_alignr_epi8 used in delta_encode_epi32

__m128i zigzag_encode_epi32(__m128i v) {
    __m128i signmask = _mm_cmpgt_epi32(_mm_setzero_si128(), v);
    return _mm_xor_si128(_mm_add_epi32(v, v), signmask);
}

__m128i zigzag_decode_epi32(__m128i v) {
    const __m128i m = _mm_set1_epi32(1);
    __m128i signmask = _mm_cmpeq_epi32(_mm_and_si128(m, v), m);
    return _mm_xor_si128(_mm_srli_epi32(v, 1), signmask);
}

__m128i delta_encode_epi32(__m128i v, __m128i prev) {
    return _mm_sub_epi32(v, _mm_alignr_epi8(v, prev, 12));
}

// prefix sum (see many answers around stackoverflow...)
__m128i delta_decode_epi32(__m128i v, __m128i prev) {
    prev = _mm_shuffle_epi32(prev, _MM_SHUFFLE(3,3,3,3)); // [P P P P]
    v = _mm_add_epi32(v, _mm_slli_si128(v, 4));           // [A AB BC CD]
    prev = _mm_add_epi32(prev, v);                        // [PA PAB PBC PCD]
    v = _mm_slli_si128(v, 8);                             // [0 0 A AB]
    return _mm_add_epi32(prev, v);                        // [PA PAB PABC PABCD]
}

__m128i delta_zigzag_encode_epi32(__m128i v, __m128i prev) {
    return zigzag_encode_epi32(delta_encode_epi32(v, prev));
}

__m128i delta_zigzag_decode_epi32(__m128i v, __m128i prev) {
    return delta_decode_epi32(zigzag_decode_epi32(v), prev);
}
Note: For delta coding, the round trip (and especially decoding) would be faster if the elements were transposed while encoding and transposed back during decoding; horizontal prefix sums are really slow. However, determining the optimum number of elements to transpose in each batch seems like a hard problem.
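For what it's worth, the transpose building block itself is cheap with SSE2 alone; a 4x4 transpose of 32-bit lanes is two levels of unpacks (a sketch only, not a full transposed delta codec). After transposing, the prefix sum turns into independent vertical adds across consecutive registers.
// Transpose a 4x4 block of 32-bit elements held in four __m128i rows, in place.
void transpose_4x4_epi32(__m128i *r0, __m128i *r1, __m128i *r2, __m128i *r3) {
    __m128i t0 = _mm_unpacklo_epi32(*r0, *r1); // [a0 b0 a1 b1]
    __m128i t1 = _mm_unpacklo_epi32(*r2, *r3); // [c0 d0 c1 d1]
    __m128i t2 = _mm_unpackhi_epi32(*r0, *r1); // [a2 b2 a3 b3]
    __m128i t3 = _mm_unpackhi_epi32(*r2, *r3); // [c2 d2 c3 d3]
    *r0 = _mm_unpacklo_epi64(t0, t1);          // [a0 b0 c0 d0]
    *r1 = _mm_unpackhi_epi64(t0, t1);          // [a1 b1 c1 d1]
    *r2 = _mm_unpacklo_epi64(t2, t3);          // [a2 b2 c2 d2]
    *r3 = _mm_unpackhi_epi64(t2, t3);          // [a3 b3 c3 d3]
}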
In order to quickly save Lua tables containing large 1-dimensional arrays (the number of arrays is known, but the number of elements isn't fixed; approximately 800,000 elements in each array), I planned to use the Lua C API in the following way:
#include "lua.h"
#include "lauxlib.h"
#include <stdio.h>
#include <assert.h>
static int save_table(lua_State *L) {
    assert(L && lua_type(L, -1) == LUA_TTABLE);
    int len, r;
    const void *ptr;
    FILE *f;

    lua_pushstring(L, "p");
    lua_gettable(L, -2);
    len = lua_objlen(L, -1);
    ptr = lua_topointer(L, -1);
    f = fopen("p.bin", "wb");
    assert(f);
    r = fwrite(ptr, sizeof(int), len, f);
    printf("[p] wrote %d elements out of %d requested\n", r, len);
    fclose(f);
    lua_pop(L, 1);

    lua_pushstring(L, "q");
    lua_gettable(L, -2);
    len = lua_objlen(L, -1);
    ptr = lua_topointer(L, -1);
    f = fopen("q.bin", "wb");
    assert(f);
    r = fwrite(ptr, sizeof(float), len, f);
    printf("[q] wrote %d elements out of %d requested\n", r, len);
    fclose(f);
    lua_pop(L, 1);

    return 1;
}

int luaopen_savetable(lua_State *L) {
    static const luaL_Reg Map[] = {{"save_table", save_table}, {NULL, NULL}};
    luaL_register(L, "mytask", Map);
    return 1;
}
The Lua code is shown below:
-- sample table containing two 1-d arrays
my_table = {p = {11, 22, 33, 44}, q = {0.12, 0.23, 0.34, 0.45, 0.56}}
require "savetable"
mytask.save_table(my_table)
The above code produces two binary files with the wrong content. What is wrong here?
PS: I am using Lua 5.1. I am not sure if this is the fastest way of dumping large Lua tables. Suggestions are always welcome.
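Not the original plan, but for contrast, here is a minimal sketch (a hypothetical helper, same includes as above, Lua 5.1 API) that copies the array part element by element through the stack instead of dumping raw table memory; note that lua_topointer returns the address of Lua's internal table object, not a contiguous int/float buffer. This is slower than a single fwrite of a raw block, but it writes the actual values.
// Hypothetical helper: write the array part of the table at stack index 'idx'
// to 'f' as native doubles, fetching each element through the Lua 5.1 API.
static int write_array_as_doubles(lua_State *L, int idx, FILE *f) {
    int n = (int)lua_objlen(L, idx);   // length of the array part (Lua 5.1)
    for (int i = 1; i <= n; i++) {
        lua_rawgeti(L, idx, i);        // push t[i]
        double d = lua_tonumber(L, -1);
        lua_pop(L, 1);
        if (fwrite(&d, sizeof d, 1, f) != 1) return -1;
    }
    return n;
}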
This question originates in a comment I almost wrote below this question, where Zack is computing the factorial of a large number modulo a large number (that we will assume to be prime for the sake of this question). Zack is using the traditional computation of factorial, taking the remainder at each multiplication.
I almost commented that an alternative to consider was Montgomery multiplication, but thinking more about it, I have only seen this technique used to speed up several multiplications by the same multiplicand (in particular, to speed up the computation of a^n mod p).
My question is: can Montgomery multiplication be used to speed up the computation of n! mod p for large n and p?
Naively, no; you need to transform each of the n terms of the product into the "Montgomery space", so you have n full reductions mod m, the same as the "usual" algorithm.
However, a factorial isn't just an arbitrary product of n terms; it's much more structured. In particular, if you already have the "Montgomerized" kr mod m, then you can use a very cheap reduction to get (k+1)r mod m.
So this is perfectly feasible, though I haven't seen it done before. I went ahead and wrote a quick-and-dirty implementation (very untested, I wouldn't trust it very far at all):
#include <stdint.h>
#include <assert.h>

// returns m^-1 mod 2**64 via clever 2-adic arithmetic (http://arxiv.org/pdf/1209.6626.pdf)
uint64_t inverse(uint64_t m) {
    assert(m % 2 == 1);
    uint64_t minv = 2 - m;
    uint64_t m_1 = m - 1;
    for (int i = 1; i < 6; i += 1) { m_1 *= m_1; minv *= (1 + m_1); }
    return minv;
}

__uint128_t full_product(uint64_t x, uint64_t y); // defined further down

uint64_t montgomery_reduce(__uint128_t x, uint64_t minv, uint64_t m) {
    // REDC: (x + ((x mod 2**64) * -m^-1 mod 2**64) * m) / 2**64
    return (uint64_t)((x + (__uint128_t)((uint64_t)x * -minv) * m) >> 64);
}

uint64_t montgomery_multiply(uint64_t x, uint64_t y, uint64_t minv, uint64_t m) {
    return montgomery_reduce(full_product(x, y), minv, m);
}

uint64_t montgomery_factorial(uint64_t x, uint64_t m) {
    assert(x < m && m % 2 == 1);
    uint64_t minv = inverse(m);  // m^-1 mod 2**64
    uint64_t r_mod_m = -m % m;   // 2**64 mod m
    uint64_t mont_term = r_mod_m;
    uint64_t mont_result = r_mod_m;
    for (uint64_t k = 2; k <= x; k++) {
        // Compute the montgomerized product term: kr mod m = (k-1)r + r mod m.
        mont_term += r_mod_m;
        if (mont_term >= m) mont_term -= m;
        // Update the result by multiplying in the new term.
        mont_result = montgomery_multiply(mont_result, mont_term, minv, m);
    }
    // Final reduction out of Montgomery form.
    return montgomery_reduce(mont_result, minv, m);
}
and benchmarked it against the usual implementation:
__uint128_t full_product(uint64_t x, uint64_t y) {
    return (__uint128_t)x * y;
}

uint64_t naive_factorial(uint64_t x, uint64_t m) {
    assert(x < m);
    uint64_t result = x ? x : 1;
    while (x --> 2) result = full_product(result, x) % m;
    return result;
}
and against the usual implementation with some inline asm to fix a minor inefficiency:
uint64_t x86_asm_factorial(uint64_t x, uint64_t m) {
    assert(x < m);
    uint64_t result = x ? x : 1;
    while (x --> 2) {
        __asm__("mov %[result], %%rax; mul %[x]; div %[m]"
                : [result] "+d" (result) : [x] "r" (x), [m] "r" (m) : "%rax", "flags");
    }
    return result;
}
Results were as follows on my Haswell laptop for reasonably large x:
implementation   speedup
------------------------
naive            1.00x
x86_asm          1.76x
montgomery       5.68x
So this really does seem to be a pretty nice win. The codegen for the Montgomery implementation is pretty decent, but could probably be improved somewhat further with hand-written assembly as well.
This is an interesting approach for "modest" x and m. Once x gets large, the various approaches that have sub-linear complexity in x will necessarily win out; factorial has so much structure that this method doesn't take advantage of.
I want to verify my problem with the solver for two different constraints. I wrote a sample program for this, where I have a variable x which I want to check and obtain a model for, once with x = 0 and once with x = 1.
I am trying to use push and pop on the solver. However, I am not sure how to do it exactly. I have written the following code. When I try to push the context and pop it back, I get a crash; I do not understand the reason for the crash, but it's a segmentation fault. Even if I comment out the push and pop instructions as below, I still get the crash.
Could someone please give me some pointers to solve the problem?
Z3_config cfg;
Z3_context ctx;
Z3_solver solver;
Z3_ast x, zero, one, x_eq_zero, x_eq_one;
cfg = Z3_mk_config();
ctx = Z3_mk_context(cfg);
Z3_del_config(cfg);
solver = Z3_mk_solver((Z3_context)ctx);
x = mk_int_var(ctx, "x");
zero = mk_int(ctx, 0);
one = mk_int(ctx, 1);
x_eq_zero = Z3_mk_eq(ctx, x, zero);
x_eq_one = Z3_mk_eq(ctx, x, one);
//Z3_solver_push (ctx, solver);
Z3_solver_assert(ctx, solver, x_eq_zero);
printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));
printf("%s \n", Z3_ast_to_string(ctx, x_eq_zero));
int result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx, Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));
// Z3_solver_pop (ctx, solver, 1);
// printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));
Z3_solver_assert(ctx, solver, x_eq_one);
result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx, Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));
return 0;
The new API in Z3 4.0 has many new features. For example, it introduces several new objects: Solvers, Goals, Tactics, Probes, etc. Moreover, we also introduce a new memory management policy for objects such as ASTs and Models that existed in previous APIs. The new memory management policy is based on reference counting. Every object has APIs of the form Z3_<object>_inc_ref and Z3_<object>_dec_ref. We still support the old memory management policy for ASTs and Models. If the Z3_context is created using Z3_mk_context, then the old memory management policy is enabled for ASTs. If it is created using Z3_mk_context_rc, then Z3_inc_ref and Z3_dec_ref must be used to manage the reference counters. However, the new objects (Solvers, Goals, Tactics, etc) only support reference counting. We strongly encourage all users to move to the new reference counting memory management policy. So, all new objects only support this policy. Moreover, all managed APIs (.Net, Python and OCaml) are based on the reference counting policy. Note that, we provide a thin C++ layer on top of the C API. It "hides" all reference counting calls using "smart pointers". The source code for the C++ layer is included in the Z3 distribution.
That being said, your program crashes because you did not increment the reference counter of the object Z3_solver. Here is the corrected version of your program. I essentially added the missing calls to Z3_solver_inc_ref and Z3_solver_dec_ref. The latter is needed to avoid a memory leak. After it, I also included the same program using the C++ API. It is much simpler. The C++ API is provided in the file include\z3++.h in the Z3 distribution. Examples are included at examples\c++.
Z3_config cfg;
Z3_context ctx;
Z3_solver solver;
Z3_ast x, zero, one, x_eq_zero, x_eq_one;
cfg = Z3_mk_config();
ctx = Z3_mk_context(cfg);
Z3_del_config(cfg);
solver = Z3_mk_solver((Z3_context)ctx);
Z3_solver_inc_ref(ctx, solver);
x = mk_int_var(ctx, "x");
zero = mk_int(ctx, 0);
one = mk_int(ctx, 1);
x_eq_zero = Z3_mk_eq(ctx, x, zero);
x_eq_one = Z3_mk_eq(ctx, x, one);
//Z3_solver_push (ctx, solver);
Z3_solver_assert(ctx, solver, x_eq_zero);
printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));
printf("%s \n", Z3_ast_to_string(ctx, x_eq_zero));
int result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx, Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));
// Z3_solver_pop (ctx, solver, 1);
// printf("Scopes : %d\n", Z3_solver_get_num_scopes((Z3_context) ctx, (Z3_solver) solver));
Z3_solver_assert(ctx, solver, x_eq_one);
result = Z3_solver_check ((Z3_context) ctx, (Z3_solver) solver);
printf("Sat Result : %d\n", result);
// printf("Model : %s\n", Z3_model_to_string ((Z3_context) ctx, Z3_solver_get_model ((Z3_context) ctx, (Z3_solver) solver)));
Z3_solver_dec_ref(ctx, solver);
return 0;
C++ version
context c;
solver s(c);
expr x = c.int_const("x");
expr x_eq_zero = x == 0;
expr x_eq_one = x == 1;
s.add(x_eq_zero);
std::cout << "Scopes : " << Z3_solver_get_num_scopes(c, s) << "\n";
std::cout << x_eq_zero << "\n";
std::cout << s.check() << "\n";
std::cout << s.get_model() << "\n";
s.add(x_eq_one);
std::cout << s.check() << "\n";
return 0;
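As for the commented-out push/pop: once the solver's reference counter is handled, scoped assertions work as expected. Here is a minimal sketch using the C API, reusing ctx, solver (already inc_ref'ed), x_eq_zero and x_eq_one from the corrected program above:
Z3_solver_push(ctx, solver);                 // open a backtracking scope
Z3_solver_assert(ctx, solver, x_eq_zero);
Z3_solver_check(ctx, solver);                // sat; the model gives x = 0
Z3_solver_pop(ctx, solver, 1);               // drop the scope, x == 0 is retracted
Z3_solver_assert(ctx, solver, x_eq_one);
Z3_solver_check(ctx, solver);                // sat; the model gives x = 1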
I have already tried GMP and MPFR, but I can't get a simple division like the one below to work. BTW, I am using the LLVM compiler in Xcode, and I compile and run on the iOS Simulator.
mpf_t a;
mpf_init2 (a, 256);
mpf_set_d(a, 0.7);
mpf_t b; mpf_init2 (b, 256);
mpf_set_d(b, 1.0);
mpf_t l; mpf_init2 (l, 256);
gmp_printf ("%.*Ff \n", 5, a); // output: 0.70000
gmp_printf ("%.*Ff \n", 5, b); // output: 1.00000
mpf_div(l, a, b);
gmp_printf ("%.*Ff", 5, l); // output: 0.52502
Have you tried MPIR? OpenSSL also provides a big number library...