How Can I Jump from FreeRTOS App1 to FreeRTOS App2? - freertos

When I jump from FreeRTOS App1 to FreeRTOS App2, the program gets stuck in the default handler.
FreeRTOS App1 boundary: 0x0 to 0x14000
FreeRTOS App2 boundary: 0x15000 to 0x30000
#define FAPP2_ADDRESS ((uint32_t)0x15000)
#define FAPP2_SIZE ((uint32_t)0x15000)
#define MSP_SPMPU (0)
#define PSP_SPMPU (1)

void app_jumpTo(uint32_t jumpLocation)
{
    if (jumpLocation != 0xFFFFFFFF)
    {
        __set_MSP(*(uint32_t*) jumpLocation);
        __set_PC(*(uint32_t*) (jumpLocation + 4));
    }
}

void jump_to_app2(void)
{
    xTimerStop(g_rtos_timer0, 0);
    R_IOPORT_Close(&g_ioport_ctrl);
    __disable_irq();
    memset((uint32_t*) NVIC->ICER, 0xFF, sizeof(NVIC->ICER));
    memset((uint32_t*) NVIC->ICPR, 0xFF, sizeof(NVIC->ICPR));
    SysTick->CTRL = 0;
    SCB->ICSR |= SCB_ICSR_PENDSTCLR_Msk;
    SCB->VTOR = (uint32_t) FAPP2_ADDRESS;
    /* Disable the HW stack monitor for MSP and PSP */
    R_MPU_SPMON->SP[MSP_SPMPU].PT = 0xA500;
    R_MPU_SPMON->SP[PSP_SPMPU].PT = 0xA500;
    R_MPU_SPMON->SP[MSP_SPMPU].CTL = 0x0000;
    R_MPU_SPMON->SP[PSP_SPMPU].CTL = 0x0000;
    // Jump to our application entry point
    app_jumpTo((uint32_t) FAPP2_ADDRESS);
}
The Fault Status window shows:
HFSR 0x40000000
MMFSR 0x0
UFSR 0x2
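For comparison, the usual Cortex-M pattern for this kind of jump reads the target image's initial MSP and reset vector from its vector table and branches through a function pointer, rather than using a __set_PC() intrinsic (which is not part of the standard CMSIS set). This is only a generic sketch and ignores any Renesas FSP-specific cleanup:

/* Generic "jump to the image whose vector table starts at app_address" sketch. */
typedef void (*app_entry_t)(void);

static void jump_to_image(uint32_t app_address)
{
    uint32_t app_msp   = *(volatile uint32_t *)app_address;        /* word 0: initial MSP        */
    uint32_t app_reset = *(volatile uint32_t *)(app_address + 4U); /* word 1: reset handler addr */

    SCB->VTOR = app_address;     /* point VTOR at the new vector table  */
    __set_MSP(app_msp);          /* load the new main stack pointer     */
    ((app_entry_t)app_reset)();  /* branch to the image's Reset_Handler */
}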

Related

ESP8266-12F WiFi soft AP config.authmode failed

For a project I am trying to use the ESP8266 RTOS SDK.
As a first step I installed the tools and the toolchain. The hello_world example and the gpio example work fine. When I try the softAP example I get a Guru Meditation Error: Core 0 panic'ed (StoreProhibited). Exception was unhandled. I figured out that line 62, .authmode = WIFI_AUTH_WPA_WPA2_PSK, does not work. I tried WIFI_AUTH_WEP, WIFI_AUTH_WPA_PSK and WIFI_AUTH_WPA2_PSK, but the softAP only works with WIFI_AUTH_OPEN. Has anyone seen the same behavior, or does anyone have some tips?
Console Trace:
ets Jan 8 2013,rst cause:1, boot mode:(3,6)
load 0x40100000, len 7040, room 16
tail 0
chksum 0xe5
load 0x3ffe8408, len 24, room 8
tail 0
chksum 0x6c
load 0x3ffe8420, len 3312, room 8
tail 8
chksum 0x75
csum 0x75
I (123) boot: ESP-IDF v3.4-rc 2nd stage bootloader
I (123) boot: compile time 19:41:32
I (207) qio_mode: Enabling default flash chip QIO
I (207) boot: SPI Speed : 40MHz
I (208) boot: SPI Mode : QOUT
I (212) boot: SPI Flash Size : 2MB
I (219) boot: Partition Table:
I (224) boot: ## Label Usage Type ST Offset Length
I (236) boot: 0 nvs WiFi data 01 02 00009000 00006000
I (247) boot: 1 phy_init RF data 01 01 0000f000 00001000
I (259) boot: 2 factory factory app 00 00 00010000 000f0000
I (271) boot: End of partition table
I (277) esp_image: segment 0: paddr=0x00010010 vaddr=0x40210010 size=0x52c80 (339072) map
I (406) esp_image: segment 1: paddr=0x00062c98 vaddr=0x40262c90 size=0x0f594 ( 62868) map
I (428) esp_image: segment 2: paddr=0x00072234 vaddr=0x3ffe8000 size=0x005fc ( 1532) load
I (429) esp_image: segment 3: paddr=0x00072838 vaddr=0x40100000 size=0x00080 ( 128) load
I (439) esp_image: segment 4: paddr=0x000728c0 vaddr=0x40100080 size=0x05560 ( 21856) load
I (460) boot: Loaded app from partition at offset 0x10000
I (481) wifi softAP: ESP_WIFI_MODE_AP
I (484) system_api: Base MAC address is not set, read default base MAC address from EFUSE
I (486) system_api: Base MAC address is not set, read default base MAC address from EFUSE
phy_version: 1163.0, 665d56c, Jun 24 2020, 10:00:08, RTOS new
I (557) phy_init: phy ver: 1163_0
I (567) wifi softAP: ----------------###------------
ESP_ERROR_CHECK failed: esp_err_t 0x2 (ERROR) at 0x4021f7cc
file: "softap_example_main.c" line 73
func: wifi_init_softap
expression: esp_wifi_set_config(ESP_IF_WIFI_AP, &wifi_config)
abort() was called at PC 0x4021f7cf on core 0
Guru Meditation Error: Core 0 panic'ed (StoreProhibited). Exception was unhandled.
Core 0 register dump:
PC : 0x40221c72 PS : 0x00000030 A0 : 0x40221c70 A1 : 0x3ffeb550
A2 : 0x00000000 A3 : 0xffffffdb A4 : 0x00000001 A5 : 0x00000001
A6 : 0x00000000 A7 : 0x4026663c A8 : 0x00000020 A9 : 0x00000000
A10 : 0x00000008 A11 : 0x00000000 A12 : 0x00000000 A13 : 0x00000000
A14 : 0x00000000 A15 : 0x00000000 SAR : 0x0000001e EXCCAUSE: 0x0000001d
Backtrace: 0x40221c72:0x3ffeb550 0x4021f7d2:0x3ffeb560 0x4022182e:0x3ffeb570 0x40221894:0x3ffeb630 0x402118ef:0x3ffeb640
Example Code from GitHub: (examples/wifi/getting_started/softAP/main/softap_example_main.c)
/* WiFi softAP Example
This example code is in the Public Domain (or CC0 licensed, at your option.)
Unless required by applicable law or agreed to in writing, this
software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied.
*/
#include <string.h>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "esp_system.h"
#include "esp_wifi.h"
#include "esp_event.h"
#include "esp_log.h"
#include "nvs_flash.h"
#include "lwip/err.h"
#include "lwip/sys.h"
/* The examples use WiFi configuration that you can set via project configuration menu.
If you'd rather not, just change the below entries to strings with
the config you want - ie #define EXAMPLE_WIFI_SSID "mywifissid"
*/
#define EXAMPLE_ESP_WIFI_SSID CONFIG_ESP_WIFI_SSID
#define EXAMPLE_ESP_WIFI_PASS CONFIG_ESP_WIFI_PASSWORD
#define EXAMPLE_MAX_STA_CONN CONFIG_ESP_MAX_STA_CONN
static const char *TAG = "wifi softAP";
static void wifi_event_handler(void* arg, esp_event_base_t event_base,
                               int32_t event_id, void* event_data)
{
    if (event_id == WIFI_EVENT_AP_STACONNECTED) {
        wifi_event_ap_staconnected_t* event = (wifi_event_ap_staconnected_t*) event_data;
        ESP_LOGI(TAG, "station "MACSTR" join, AID=%d",
                 MAC2STR(event->mac), event->aid);
    } else if (event_id == WIFI_EVENT_AP_STADISCONNECTED) {
        wifi_event_ap_stadisconnected_t* event = (wifi_event_ap_stadisconnected_t*) event_data;
        ESP_LOGI(TAG, "station "MACSTR" leave, AID=%d",
                 MAC2STR(event->mac), event->aid);
    }
}

void wifi_init_softap()
{
    tcpip_adapter_init();
    ESP_ERROR_CHECK(esp_event_loop_create_default());

    wifi_init_config_t cfg = WIFI_INIT_CONFIG_DEFAULT();
    ESP_ERROR_CHECK(esp_wifi_init(&cfg));
    ESP_ERROR_CHECK(esp_event_handler_register(WIFI_EVENT, ESP_EVENT_ANY_ID, &wifi_event_handler, NULL));

    wifi_config_t wifi_config = {
        .ap = {
            .ssid = EXAMPLE_ESP_WIFI_SSID,
            .ssid_len = strlen(EXAMPLE_ESP_WIFI_SSID),
            .password = EXAMPLE_ESP_WIFI_PASS,
            .max_connection = EXAMPLE_MAX_STA_CONN,
            .authmode = WIFI_AUTH_WPA_WPA2_PSK
        },
    };
    if (strlen(EXAMPLE_ESP_WIFI_PASS) == 0) {
        wifi_config.ap.authmode = WIFI_AUTH_OPEN;
    }

    ESP_ERROR_CHECK(esp_wifi_set_mode(WIFI_MODE_AP));
    ESP_ERROR_CHECK(esp_wifi_set_config(ESP_IF_WIFI_AP, &wifi_config));
    ESP_ERROR_CHECK(esp_wifi_start());

    ESP_LOGI(TAG, "wifi_init_softap finished. SSID:%s password:%s",
             EXAMPLE_ESP_WIFI_SSID, EXAMPLE_ESP_WIFI_PASS);
}

void app_main()
{
    ESP_ERROR_CHECK(nvs_flash_init());
    ESP_LOGI(TAG, "ESP_WIFI_MODE_AP");
    wifi_init_softap();
}
You have to run idf.py menuconfig and set the SSID and password values.
Passwords shorter than 8 characters cause a Guru Meditation error if
wifi_config.ap.authmode = WIFI_AUTH_WEP,
wifi_config.ap.authmode = WIFI_AUTH_WPA_PSK, or
wifi_config.ap.authmode = WIFI_AUTH_WPA2_PSK. Other authmodes have not been tested yet.
To avoid this error, make sure your password has more than 7 characters and/or change
the if-condition after wifi_config to:
if (strlen(EXAMPLE_ESP_WIFI_PASS) < 8) {
    wifi_config.ap.authmode = WIFI_AUTH_OPEN;
    ESP_LOGI(TAG, "password length under 8 characters. Set WIFI_AUTH_OPEN");
}
Your WiFi is then not protected, but your µC does not crash.
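Alternatively, rather than letting ESP_ERROR_CHECK abort the program, you can check the return value of esp_wifi_set_config() yourself and fall back to an open AP. A rough sketch, assuming the SDK version in use provides esp_err_to_name() (drop that call if it is not available):

esp_err_t err = esp_wifi_set_config(ESP_IF_WIFI_AP, &wifi_config);
if (err != ESP_OK) {
    // e.g. a WPA/WPA2 authmode combined with a password shorter than 8 characters
    ESP_LOGE(TAG, "esp_wifi_set_config failed (%s), falling back to an open AP", esp_err_to_name(err));
    wifi_config.ap.authmode = WIFI_AUTH_OPEN;
    ESP_ERROR_CHECK(esp_wifi_set_config(ESP_IF_WIFI_AP, &wifi_config));
}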

Allocating boot time memory using dts configuration

I am working with a particular encoder which uses 512MB of boot-time allocated memory. I am supposed to allocate the memory through the dts file configuration. The physical memory is divided into low and high regions in the dts file as below:
/* 256MB at 0x0 */
memory@0 {
    device_type = "memory";
    reg = <0x0 0x00000000 0x0 0x10000000>;
};

/* 2GB at 0x8010000000 */
memory@8000000000 {
    device_type = "memory";
    reg = <0x80 0x10000000 0x0 0x80000000>;
};
Now I want to allocate the boot-time carved-out memory from the high memory region. I can think of creating a dts entry as below:
encoder: encoder@0xxxxxxxxx {
    compatible = "xyz, abc";
    reg = <0x0 0x80000000 0x0 0x20000000>;
};
Here encoder@0xxxxxxxxx is the actual device entry, and it already has a register region defined as below:
encoder: encoder@0xxxxxxxxx {
    compatible = "xyz, abc";
    #address-cells = <1>;
    #size-cells = <1>;
    reg = <0x0 0xxxxxxxxxx 0x0 0x1234>;
    status = "disabled";
};
So after adding the carved-out memory, the entry would look like this:
encoder: encoder@0xxxxxxxxx {
    compatible = "xyz, abc";
    #address-cells = <1>;
    #size-cells = <1>;
    reg = <0x0 0xxxxxxxxxx 0x0 0x1234>;
    reg = <0x0 0x80000000 0x0 0x20000000>;
    status = "disabled";
};
Would this work? I am also not sure how the driver code would know the start address of the carved-out memory and its size.
Can anyone please help?
Thanks.
I achieved this by carving out memory from the lower part of the high memory region for my device. To do this I modified my dtsi file to reflect the new memory map as below:
/* 256MB at 0x0 - this is the low memory region */
memory@0 {
    device_type = "memory";
    reg = <0x0 0x00000000 0x0 0x10000000>;
};

/* 1GB at 0x8010000000 - high memory region. This used to be 2GB; I carved it down to 1GB and allocated 1GB to my device. */
memory@8000000000 {
    device_type = "memory";
    reg = <0x80 0x10000000 0x0 0x40000000>;
};

/* 1GB carved out for my device. */
my_device: mydevice@8050000000 {
    compatible = "xxx,xxx-yyy";
    reg = <0x00 0x20810000 0x0 0x010000>,
          <0x80 0x50000000 0x0 0x40000000>;
    interrupts = <GIC_SHARED xx IRQ_TYPE_LEVEL_HIGH>;
    status = "disabled";
};
Allocating 1GB was a bit too much, and I later reduced it to 256MB, but that is not important here.
Then in the driver I retrieved the memory details as below:
struct resource *res;
res = platform_get_resource(pdev, IORESOURCE_MEM, 0); /* gets hold of the registers of my device */
res = platform_get_resource(pdev, IORESOURCE_MEM, 1); /* the device memory set aside in the dtsi file above */
To use the memory regions retrieved above:
res->start                              /* start address of the memory region */
res->end - res->start + 1               /* size of the memory region */
devm_ioremap_resource(&pdev->dev, res); /* returns the mapped kernel virtual address for the memory bank */
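Putting those pieces together, a minimal probe sketch might look like the following. The driver name and index assignments are placeholders matching the dtsi above, and resource_size() is simply a helper for the end - start + 1 calculation:

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/err.h>
#include <linux/io.h>
#include <linux/ioport.h>

static int mydevice_probe(struct platform_device *pdev)
{
    struct resource *regs_res, *mem_res;
    void __iomem *regs;

    /* Index 0: the register block (first reg entry in the dts node). */
    regs_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
    regs = devm_ioremap_resource(&pdev->dev, regs_res);
    if (IS_ERR(regs))
        return PTR_ERR(regs);

    /* Index 1: the carved-out memory bank (second reg entry). */
    mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
    if (!mem_res)
        return -ENODEV;

    dev_info(&pdev->dev, "carve-out start %pa, size 0x%llx\n",
             &mem_res->start, (unsigned long long)resource_size(mem_res));

    return 0;
}

static const struct of_device_id mydevice_of_match[] = {
    { .compatible = "xxx,xxx-yyy" },
    { /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, mydevice_of_match);

static struct platform_driver mydevice_driver = {
    .probe  = mydevice_probe,
    .driver = {
        .name           = "mydevice",
        .of_match_table = mydevice_of_match,
    },
};
module_platform_driver(mydevice_driver);

MODULE_LICENSE("GPL");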

mprotect errno 22 iOS

I'm developing a jailbroken app on iOS and getting errno 22 when calling
mprotect(p, 1024, PROT_READ | PROT_EXEC)
errno 22 means invalid arguments, but I can't figure out what's wrong. I've aligned p to a multiple of the page size, and I malloc'ed the memory before calling mprotect.
Here's my code and sample output:
#define PAGESIZE 4096

FILE * pFile;
pFile = fopen("log.txt", "w");

uint32_t code[] = {
    0xe2800001, // add r0, r0, #1
    0xe12fff1e, // bx lr
};
uint32_t *p;    // destination buffer for the generated code

fprintf(pFile, "Before Execution\n");
p = (uint32_t *)malloc(1024 + PAGESIZE - 1);
if (!p) {
    fprintf(pFile, "Couldn't malloc(1024)");
    perror("Couldn't malloc(1024)");
    exit(errno);
}
fprintf(pFile, "Malloced to %p\n", p);
p = (uint32_t *)(((uintptr_t)p + PAGESIZE - 1) & ~(PAGESIZE - 1));
fprintf(pFile, "Moved pointer to %p\n", p);
fprintf(pFile, "Before Compiling\n");
// copy instructions to function
p[0] = code[0];
p[1] = code[1];
fprintf(pFile, "After Compiling\n");
if (mprotect(p, 1024, PROT_READ | PROT_EXEC)) {
    int err = errno;
    fprintf(pFile, "Couldn't mprotect2: %i\n", errno);
    perror("Couldn't mprotect");
    exit(errno);
}
And output:
Before Execution
Malloced to 0x13611ec00
Moved pointer 0x13611f000
Before Compiling
After Compiling
Couldn't mprotect2: 22
I fixed this by using posix_memalign(). It turns out I wasn't aligning my pointer to the page size correctly.
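One likely contributing factor on 64-bit iOS devices is that arm64 iOS uses 16 KB pages, so rounding a malloc'ed pointer up to a 4096-byte boundary does not actually page-align it, and mprotect() then returns EINVAL. Querying the page size at runtime and letting posix_memalign() do the alignment avoids both problems. A rough sketch of that approach (error handling kept minimal; actually executing the copied instructions is out of scope here):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    uint32_t code[] = {
        0xe2800001, // add r0, r0, #1
        0xe12fff1e, // bx lr
    };

    size_t pagesize = (size_t)getpagesize();   // 16384 on arm64 iOS, not 4096
    void *p = NULL;

    // Allocate one whole page, aligned to a real page boundary.
    if (posix_memalign(&p, pagesize, pagesize) != 0) {
        perror("posix_memalign");
        return 1;
    }

    memcpy(p, code, sizeof(code));

    // p is now genuinely page-aligned, so EINVAL (22) should not occur here.
    if (mprotect(p, pagesize, PROT_READ | PROT_EXEC) != 0) {
        fprintf(stderr, "mprotect failed: %d (%s)\n", errno, strerror(errno));
        return 1;
    }
    return 0;
}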

iOS 64-bit ntpdate doesn't work properly with the arm64 flag

I have a function (ANSI C) to retrieve the time from our ntpd server.
This code works properly when I compile it for 32-bit, but it doesn't work when I compile it for arm64.
It works properly on iPhone 4, 4S and 5 (32-bit); it does not work properly on iPhone 5s, 6 and 6S (64-bit).
I think the problem is:
tmit=ntohl((time_t)buf[10]); //# get transmit time
time_t is now 8 bytes when compiled for arm64.
Below you can find the source code...
Correct output with iPhone 5 Simulator (32-bit) ---------------------------
xxx.xxx.xxx.xxx PORT 123
sendto-->48
prima recv
recv-->48
tmit=-661900093
tmit=1424078403
1424078403-->Time: Mon Feb 16 10:20:03 2015
10:20:03 --> 37203
---------------------------------------------------------
Wrong output with iPhone 6 Simulator (64-bit) ---------------------------
xxx.xxx.xxx.xxx PORT 123
sendto-->48
prima recv
recv-->48
tmit=19612797
tmit=2105591293
2105591293-->Time: Tue Nov 19 00:47:09 38239
00:47:09 --> 2829
//---------------------------------------------------------------------------
long ntpdate(char *hostname) {
//ntp1.inrim.it (193.204.114.232)
//ntp2.inrim.it (193.204.114.233)
int portno=NTP_PORT; //NTP is port 123
int maxlen=1024; //check our buffers
int i=0; // misc var i
unsigned char msg[48]={010,0,0,0,0,0,0,0,0}; // the packet we send
unsigned long buf[maxlen]; // the buffer we get back
//struct in_addr ipaddr; //
struct protoent *proto; //
struct sockaddr_in server_addr;
int s; // socket
int tmit; // the time -- This is a time_t sort of
char ora[20]="";
//
//#we use the system call to open a UDP socket
//socket(SOCKET, PF_INET, SOCK_DGRAM, getprotobyname("udp")) or die "socket: $!";
proto=getprotobyname("udp");
s=socket(PF_INET, SOCK_DGRAM, proto->p_proto);
if(s==-1) {
//printf("ERROR socket=%d\n",s);
return -1;
}
//Set the receive timeout --------------------
struct timeval tv;
tv.tv_sec = TIMEOUT_NTP; //sec
tv.tv_usec = 0;
if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(struct timeval)) != 0)
{
//printf("Error assigning socket option");
return -1;
}
memset( &server_addr, 0, sizeof( server_addr ));
server_addr.sin_family=AF_INET;
//convert the hostname to an IP address
struct hostent *hp = gethostbyname(hostname);
if (hp == NULL) {
return -1;
} else {
sprintf(hostname_ip, "%s", inet_ntoa( *( struct in_addr*)( hp -> h_addr_list[0])));
}
#ifdef LOG_NTP
printf("%s-->%s PORT %d\n",hostname,hostname_ip,portno);
#endif
server_addr.sin_addr.s_addr = inet_addr(hostname_ip);
server_addr.sin_port=htons(portno);
//printf("ipaddr (in hex): %x\n",server_addr.sin_addr);
/*
* build a message. Our message is all zeros except for a one in the
* protocol version field
* msg[] in binary is 00 001 000 00000000
* it should be a total of 48 bytes long
*/
// send the data
i=sendto(s,msg,sizeof(msg),0,(struct sockaddr *)&server_addr,sizeof(server_addr));
#ifdef LOG_NTP
printf("sendto-->%d\n",i);
#endif
if (i==-1)
return -1;
#ifdef LOG_NTP
printf("prima recv\n");
#endif
// get the data back
i=recv(s,buf,sizeof(buf),0);
#ifdef LOG_NTP
printf("recv-->%d\n",i);
#endif
if (i==-1)
{
#ifdef LOG_NTP
printf("Error: %s (%d)\n", strerror(errno), errno);
#endif
return -1;
}
//printf("recvfr: %d\n",i);
//We get 12 long words back in Network order
//for(i=0;i<12;i++)
//printf("%d\t%-8x\n",i,ntohl(buf[i]));
/*
* The high word of transmit time is the 10th word we get back
* tmit is the time in seconds not accounting for network delays which
* should be way less than a second if this is a local NTP server
*/
tmit=ntohl((time_t)buf[10]); //# get transmit time
#ifdef LOG_NTP
printf("tmit=%d\n",tmit);
#endif
/*
* Convert time to unix standard time NTP is number of seconds since 0000
* UT on 1 January 1900 unix time is seconds since 0000 UT on 1 January
* 1970 There has been a trend to add a 2 leap seconds every 3 years.
* Leap seconds are only an issue the last second of the month in June and
* December if you don't try to set the clock then it can be ignored but
* this is importaint to people who coordinate times with GPS clock sources.
*/
tmit-= 2208988800U;
#ifdef LOG_NTP
printf("tmit=%d\n",tmit);
#endif
/* use unix library function to show me the local time (it takes care
* of timezone issues for both north and south of the equator and places
* that do Summer time/ Daylight savings time.
*/
//#compare to system time
#ifdef LOG_NTP
//printf("%d-->Time: %s\n",tmit,ctime((const time_t)&tmit));
printf("%d-->Time: %s\n",tmit,ctime((const time_t)&tmit));
#endif
//i=time(0);
//printf("%d-%d=%d\n",i,tmit,i-tmit);
//printf("System time is %d seconds off\n",i-tmit);
//Get the time and convert it from HH:MM:SS --> seconds
strftime(ora, 20, "%T", localtime((const time_t)&tmit));
#ifdef LOG_NTP
printf("%s --> %ld\n",ora, C2TIME(ora));
#endif
return C2TIME(ora);
}
I solved the problem! Use:
uint32_t buf[maxlen];
uint32_t tmit;
instead of:
unsigned long buf[maxlen];
int tmit;
and define a variable of type time_t for the library calls:
time_t tmit_temp = tmit;
printf("%d-->Time: %s\n", tmit, ctime(&tmit_temp));
strftime(ora, 20, "%T", localtime(&tmit_temp));
This works properly! ;-)
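For clarity, here is a minimal sketch of just the timestamp handling with those changes applied. It assumes the same 48-byte NTP reply layout the function above relies on (the seconds field of the transmit timestamp is the 10th 32-bit word) and the standard 2208988800-second offset between the NTP and Unix epochs:

#include <arpa/inet.h>
#include <stdint.h>
#include <time.h>

/* Convert the transmit timestamp of an NTP reply (seconds since 1900,
 * network byte order, 10th 32-bit word of the packet) to a Unix time_t.
 * Using uint32_t keeps the arithmetic identical on 32-bit and 64-bit builds. */
static time_t ntp_transmit_to_unix(const uint32_t *buf)
{
    uint32_t tmit = ntohl(buf[10]);  /* network order -> host order           */
    tmit -= 2208988800U;             /* NTP epoch (1900) -> Unix epoch (1970) */
    return (time_t)tmit;             /* widen only at the very end            */
}

It would be called with the same buf that recv() fills in the function above, e.g. time_t t = ntp_transmit_to_unix(buf); before formatting the result with ctime() or strftime().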

Where is the global memory replay overhead coming from?

Running the code below to write 1 GB in global memory in the NVIDIA Visual Profiler, I get:
- 100% storage efficiency
- 69.4% (128.6 GB/s) DRAM utilization
- 18.3% total replay overhead
- 18.3% global memory replay overhead.
The memory writes are supposed to be coalesced and there is no divergence in the kernel, so the question is: where is the global memory replay overhead coming from? I am running this on Ubuntu 13.04, with nvidia-cuda-toolkit version 5.0.35-4ubuntu1.
#include <cuda.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
#include <ctype.h>
#include <sched.h>
#include <assert.h>
static void
HandleError( cudaError_t err, const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

// Global memory writes
__global__ void
kernel_write(uint32_t *start, uint32_t entries)
{
    uint32_t tid = threadIdx.x + blockIdx.x*blockDim.x;
    while (tid < entries) {
        start[tid] = tid;
        tid += blockDim.x*gridDim.x;
    }
}

int main(int argc, char *argv[])
{
    uint32_t *gpu_mem;               // Memory pointer
    uint32_t n_blocks  = 256;        // Blocks per grid
    uint32_t n_threads = 192;        // Threads per block
    uint32_t n_bytes   = 1073741824; // Transfer size (1 GB)
    float elapsedTime;               // Elapsed write time

    // Allocate 1 GB of memory on the device
    HANDLE_ERROR( cudaMalloc((void **)&gpu_mem, n_bytes) );

    // Create events
    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate(&start) );
    HANDLE_ERROR( cudaEventCreate(&stop) );

    // Write to global memory
    HANDLE_ERROR( cudaEventRecord(start, 0) );
    kernel_write<<<n_blocks, n_threads>>>(gpu_mem, n_bytes/4);
    HANDLE_ERROR( cudaGetLastError() );
    HANDLE_ERROR( cudaEventRecord(stop, 0) );
    HANDLE_ERROR( cudaEventSynchronize(stop) );
    HANDLE_ERROR( cudaEventElapsedTime(&elapsedTime, start, stop) );

    // Report exchange time
    printf("#Delay(ms) BW(GB/s)\n");
    printf("%10.6f %10.6f\n", elapsedTime, 1e-6*n_bytes/elapsedTime);

    // Destroy events
    HANDLE_ERROR( cudaEventDestroy(start) );
    HANDLE_ERROR( cudaEventDestroy(stop) );

    // Free memory
    HANDLE_ERROR( cudaFree(gpu_mem) );
    return 0;
}
The nvprof profiler and the API profiler are giving different results:
$ nvprof --events gst_request ./app
======== NVPROF is profiling app...
======== Command: app
#Delay(ms) BW(GB/s)
13.345920 80.454690
======== Profiling result:
Invocations Avg Min Max Event Name
Device 0
Kernel: kernel_write(unsigned int*, unsigned int)
1 8388608 8388608 8388608 gst_request
$ nvprof --events global_store_transaction ./app
======== NVPROF is profiling app...
======== Command: app
#Delay(ms) BW(GB/s)
9.469216 113.392892
======== Profiling result:
Invocations Avg Min Max Event Name
Device 0
Kernel: kernel_write(unsigned int*, unsigned int)
1 8257560 8257560 8257560 global_store_transaction
I had the impression that global_store_transaction could not be lower than gst_request. What is going on here? I can't ask for both events in the same command, so I had to run two separate commands. Could this be the problem?
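For reference, the warp-level request count reported by nvprof matches a back-of-the-envelope calculation (assuming 32-thread warps and one request per warp of 4-byte stores; this is just an arithmetic cross-check, not profiler output):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t n_bytes  = 1073741824ULL;   // 1 GB written by the kernel
    uint64_t stores   = n_bytes / 4;     // one 4-byte store per element: 268435456
    uint64_t requests = stores / 32;     // one warp-level request per 32 stores: 8388608

    // With perfect coalescing, each 128-byte request should map to a single
    // global store transaction, i.e. ~8388608 transactions, which is what the
    // nvprof gst_request count above shows.
    printf("stores=%llu requests=%llu\n",
           (unsigned long long)stores, (unsigned long long)requests);
    return 0;
}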
Strangely, the API profiler shows different results, with perfect coalescing. Here is the output; I had to run it twice to get all the counters:
$ cat config.txt
inst_issued
inst_executed
gst_request
$ COMPUTE_PROFILE=1 COMPUTE_PROFILE_CSV=1 COMPUTE_PROFILE_LOG=log.csv COMPUTE_PROFILE_CONFIG=config.txt ./app
$ cat log.csv
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 580
# CUDA_CONTEXT 1
# CUDA_PROFILE_CSV 1
# TIMESTAMPFACTOR fffff67eaca946b8
method,gputime,cputime,occupancy,inst_issued,inst_executed,gst_request,gld_request
_Z12kernel_writePjj,7771.776,7806.000,1.000,4737053,3900426,557058,0
$ cat config2.txt
global_store_transaction
$ COMPUTE_PROFILE=1 COMPUTE_PROFILE_CSV=1 COMPUTE_PROFILE_LOG=log2.csv COMPUTE_PROFILE_CONFIG=config2.txt ./app
$ cat log2.csv
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 580
# CUDA_CONTEXT 1
# CUDA_PROFILE_CSV 1
# TIMESTAMPFACTOR fffff67eea92d0e8
method,gputime,cputime,occupancy,global_store_transaction
_Z12kernel_writePjj,7807.584,7831.000,1.000,557058
Here gst_request and global_store_transaction are exactly the same, showing perfect coalescing. Which one is correct (nvprof or the API profiler)? Why does the NVIDIA Visual Profiler say that I have non-coalesced writes? There are still significant instruction replays, and I have no idea where they are coming from :(
Any ideas? I don't think this is a hardware malfunction, since I have two boards in the same machine and both show the same behavior.
