Suspected race condition on scheduling on v0.7
by Sağnak Taşırlar
Hey all,
I was playing with v0.7; I suspect there is a race condition somewhere, and I am not sure whether it crept into v0.8 or not. So here is how[2] to replicate it for those who are interested. Let me also note[1] the configuration of the foobar cluster where I observed this.
Cheers,
Sagnak
[1]
[sagnak@bar27 ocr-x86]$ cat /etc/redhat-release
Fedora release 19 (Schrödinger’s Cat)
[sagnak@bar27 ocr-x86]$ uname -a
Linux bar27 3.9.9-302.fc19.x86_64 #1 SMP Sat Jul 6 13:41:07 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux
[sagnak@bar27 ocr-x86]$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.1/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto --enable-plugin --enable-initfini-array --enable-java-awt=gtk --disable-dssi --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-1.5.0.0/jre --enable-libgcj-multifile --enable-java-maintainer-mode --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --disable-libjava-multilib --with-isl=/builddir/build/BUILD/gcc-4.8.1-20130603/obj-x86_64-redhat-linux/isl-install --with-cloog=/builddir/build/BUILD/gcc-4.8.1-20130603/obj-x86_64-redhat-linux/cloog-install --with-tune=generic --with-arch_32=i686 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.1 20130603 (Red Hat 4.8.1-1) (GCC)
// the gcc flags that the compilation invokes
[sagnak@bar27 ocr-x86]$ cd ./examples/cholesky; make compile; cd -
gcc -O3 -g -L/home/sagnak/xstack/ocr/runtime/ocr-x86//ocr-install/lib -I/home/sagnak/xstack/ocr/runtime/ocr-x86//ocr-install/include -locr -lm -I. cholesky.c -o cholesky.exe
/home/sagnak/xstack/ocr/runtime/ocr-x86
[2]
1) checkout an OCR ~v0.7
git checkout fb012da7fbefc9183891bd4ef70003286aca4e01
git checkout -b some.former.branch
2) Make OCR v0.7 runnable under foobar; below is a patch
index 16dde16..88c6ff9 100644
--- a/ocr/runtime/ocr-x86/src/driver/ocr-config.c
+++ b/ocr/runtime/ocr-x86/src/driver/ocr-config.c
@@ -42,7 +42,7 @@ ocrDataBlockKind ocrDataBlockDefaultKind = OCR_DATABLOCK_REGULAR;
ocrLockKind ocrLockDefaultKind = OCR_LOCK_X86;
ocrGuidProviderKind ocrGuidProviderDefaultKind = OCR_GUIDPROVIDER_PTR;
-u32 ocr_config_default_nb_hardware_threads = 8;
+u32 ocr_config_default_nb_hardware_threads = 16;
// XE kinds of ocr modules
ocr_executor_kind ocr_executor_xe_kind = OCR_EXECUTOR_XE;
index 86225a9..a550324 100644
--- a/ocr/runtime/ocr-x86/src/driver/ocr-driver.c
+++ b/ocr/runtime/ocr-x86/src/driver/ocr-driver.c
@@ -91,7 +91,7 @@ void ocrInit(int * argc, char ** argv, u32 fnc, ocrEdt_t funcs[]) {
globalGuidProvider = newGuidProvider(OCR_GUIDPROVIDER_DEFAULT);
u32 nbHardThreads = ocr_config_default_nb_hardware_threads;
- gHackTotalMemSize = 64*1024*1024; /* 64 MB default */
+ gHackTotalMemSize = 512*1024*1024; /* 64 MB default */
char * md_file = parseOcrOptions_MachineDescription(argc, argv);
/* sagnak begin */
3) build ocr (type install from multiple login sessions till success)
$> rm -Rf compileTree; rm -Rf ocr-install; ./install.sh
4) check out a big cholesky input
svn co https://svn.rice.edu/r/parsoft/Intel/CnC-X10/examples-input/Cholesky
5) debug cholesky
gdb --args ./cholesky.exe 6000 80 ~/Cholesky/m_06000.in
6) run ('r') repeatedly until you observe
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fffd75c9700 (LWP 19699)]
0x00007ffff7df49f5 in hc_task_execute (base=0x13d0f570) at ../../src/scheduler/ocr-scheduler-hc/ocr-edf-hc.c:270
270 derived->depv[i].ptr = db->acquire(db, base->guid, true);
7) observe the task that got prematurely scheduled
(gdb) p *(hc_task_t*)base
$3 = {
base = {
guid = 332461424,
paramc = 5,
params = 0x0,
paramv = 0x13d0f510,
destruct = 0x7ffff7df4e70 <hc_task_destruct>,
iterate_waiting_frontier = 0x7ffff7df48d0 <hc_task_iterate_waiting_frontier>,
execute = 0x7ffff7df4930 <hc_task_execute>,
schedule = 0x7ffff7df4b20 <hc_task_schedule>,
add_dependence = 0x7ffff7df4920 <hc_task_add_dependence>
},
awaitList = 0x13d0f5e0,
nbdeps = 3,
depv = 0x7fffc8072030,
p_function = 0x402da0 <update_nondiagonal_task>
}
8) observe the task's await list, and that the waiting frontier is pointing at the null terminator of the event list
(gdb) p *((hc_task_t*)base)->awaitList
$7 = {
array = 0x13d0f600,
waitingFrontier = 0x13d0f618
}
(gdb) p *((hc_task_t*)base)->awaitList->array
$8 = (ocr_event_t *) 0xc34710
(gdb) p ((hc_task_t*)base)->awaitList->array[0]
$9 = (ocr_event_t *) 0xc34710
(gdb) p ((hc_task_t*)base)->awaitList->array[1]
$10 = (ocr_event_t *) 0xc29bd0
(gdb) p ((hc_task_t*)base)->awaitList->array[2]
$11 = (ocr_event_t *) 0xbeaab0
(gdb) p ((hc_task_t*)base)->awaitList->array[3]
$12 = (ocr_event_t *) 0x0
(gdb) p &((hc_task_t*)base)->awaitList->array[3]
$16 = (ocr_event_t **) 0x13d0f618
9) explore the events that the task depends on; see one whose data has not been initialized (not put) and whose register list has not been nullified yet
(gdb) p *(hc_event_t*)(((hc_task_t*)base)->awaitList->array[0])
$19 = {
base = {
guid = 12797712,
destruct = 0x7ffff7df4ab0 <hc_event_destructor>,
get = 0x7ffff7df48b0 <hc_event_get>,
put = 0x7ffff7df4d70 <hc_event_put>,
registerIfNotReady = 0x7ffff7df4ad0 <hc_event_register_if_not_ready>
},
datum = -2,
register_list = 0x13d0f630
}
10) observe Schrödinger's event that is satisfied and unsatisfied at the same time
(gdb) p *(((hc_event_t*)(((hc_task_t*)base)->awaitList->array[0]))->register_list)
$24 = {
task_guid = 332461424,
next = 0xffffffffffffffff
}
11) observe that the task registered as a consumer waiting for that event to be satisfied is the task that got scheduled prematurely
(gdb) p (hc_task_t*)(((hc_event_t*)(((hc_task_t*)base)->awaitList->array[0]))->register_list)->task_guid
$26 = (hc_task_t *) 0x13d0f570
(gdb) p (hc_task_t*)base
$27 = (hc_task_t *) 0x13d0f570
8 years, 8 months