#include "chess.h"
#include "data.h"
#include "epdglue.h"
/* modified 04/30/14 */
/*
*******************************************************************************
* *
* Thread() is the driver for the threaded parallel search in Crafty. The *
* basic idea is that whenever we notice that one (or more) threads are in *
* their idle loop, we drop into Thread() from Search() and begin a new *
* parallel search from this node. This is simply a problem of copying the *
* search state space for each thread working at this node, then sending *
* everyone off to SearchParallel() to search this node in parallel. *
* *
* There are a number of settable options via the command-line or .craftyrc *
* initialization file. Here's a concise explanation for each option and an *
* occasional suggestion for testing/tuning. *
* *
* smp_max_threads (command = smpmt=n) sets the total number of allowable *
* threads for the search. The default is one (1) as Crafty does not *
* assume it should use all available resources. For optimal performance *
* this should be set to the number of physical cores your machine has, *
* which does NOT include hyperthreaded cores. *
* *
* smp_split_group (command = smpgroup=n) sets the maximum number of threads *
* at any single split point, with the exception of split points in the *
* first half of the tree (ply <= depth / 2, see Thread()), where ALL *
* threads are allowed to split together, ignoring this limit. *
* *
* smp_min_split_depth (command = smpmin=n) avoids splitting when remaining *
* depth < n. This is used to balance splitting overhead cost against *
* the speed gain the parallel search produces. The default is currently *
* 5 (which could change with future generations of Intel hardware) but *
* values between 4 and 8 will work. Larger values allow somewhat fewer *
* splits, which reduces overhead, but they also increase the percentage *
* of time that a thread spends waiting on work. *
* *
* smp_split_nodes (command = smpnodes=n) will not allow a thread to split *
* until it has searched at least N nodes, to prevent excessive splitting *
* out near the tips in endgames. The default is 2000. Larger values *
* will reduce splitting overhead but will increase the amount of time *
* a thread is waiting on work. *
* *
* smp_split_at_root (command=smproot=0 or 1) enables (1) or disables (0) *
* splitting the tree at the root. This defaults to 1 which produces the *
* best performance by a significant margin. But it can be disabled if *
* you are playing with code changes. *
* *
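* As a concrete illustration only (these values are an example for a *
* machine with four physical cores, not a recommendation), a .craftyrc *
* file might contain: *
* *
* smpmt=4 *
* smpgroup=4 *
* smpmin=5 *
* smpnodes=2000 *
* smproot=1 *
* *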
* The best way to tune any of these parameters is to run SEVERAL test cases *
* (positions) with max threads set. Run each set of positions several *
* times with each parameter change you want to try (do them ONE at a time *
* for obvious reasons). Pick the value with the smallest overall search *
* time. The more cores you use, the more times you should run each test, *
* since parallel search is highly non-deterministic and you need several *
* runs to get a reasonable average. *
* *
* A few basic "rules of the road" for anyone interested in changing or *
* adding to any of this code. *
* *
* 1. If, at any time, you want to modify your private split block, no lock *
* is required. *
* *
* 2. If, at any time, you want to modify another split block, such as the *
* parent split block shared move list, you must acquire the lock in the *
* split block first, i.e., acquire tree->parent->lock to protect the *
* parent split block during NextMove() and such (see the sketch after *
* rule 6 below). Note that this ONLY applies to *
* search-related data, NOT the SMP data within the split block (parent *
* pointer, sibling pointers, number of processors working here, etc). *
* Modifying those falls under the next lock below. *
* *
* 3. If you want to modify any SMP-related data, such as allocating a *
* split block, or telling sibling processes to stop, etc, then you must *
* acquire the global "smp_lock" lock first. This prevents any sort of *
* race condition that could corrupt the split block tree organization. *
* This applies to ANY smp-related variable from the simple smp_idle *
* variable through the various sibling chains and such. *
* *
* 4. If you want to do any sort of I/O operation, you must acquire the *
* "lock_io" lock first. Since threads share descriptors, there are *
* lots of potential race conditions, from the simple tty-output getting *
* interlaced from different threads, to outright data corruption in the *
* book or log files. *
* *
* 5. There is a specific "lock_split" that is used to prevent a lot of *
* queued waiting to do splits, but it is unlikely that lock would ever *
* be used anywhere other than where it is currently accessed in *
* Search() at the parallel split initiation code. *
* *
* 6. If you want to alter the root move list, you must first acquire *
* lock_root, since the root move list is shared and multiple threads *
* can attempt to modify it at the same time. Overlooking this can *
* result in a corrupted root move list with one or more moves missing *
* or duplicated. *
* *
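* As a minimal sketch of rules 2 and 3 above (illustrative only, using *
* lock names that appear elsewhere in this file): *
* *
* Lock(tree->parent->lock); -- rule 2: search data in the parent *
* ... update the parent's shared move list ... *
* Unlock(tree->parent->lock); *
* *
* Lock(lock_smp); -- rule 3: global SMP state *
* smp_idle++; *
* Unlock(lock_smp); *
* *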
* Some of the bugs caused by failing to acquire the correct lock occur *
* only infrequently, and they are extremely difficult to find. Some only *
* show up in a public game where everyone is watching, causing maximum *
* embarrassment as the program does something extremely stupid. *
* *
*******************************************************************************
*/
int Thread(TREE * RESTRICT tree) {
int proc, tsplit = 0, nblocks = 0;
/*
************************************************************
* *
* First, we make sure that there are idle threads. It is *
* possible that we get here after they have all been *
* claimed by another thread. In that case we return. *
* *
************************************************************
*/
Lock(lock_smp);
for (proc = 0; proc < smp_max_threads && !thread[proc].idle; proc++);
if (proc == smp_max_threads) {
smp_split = 0;
Unlock(lock_smp);
return 0;
}
/*
************************************************************
* *
* Now we start copying the current "state" from this *
* thread into new thread blocks so that the threads can *
* search this position without interfering with each *
* other. As we copy, we link those blocks as siblings of *
* the parent node, and point each sibling back to the *
* parent so we can unwind this confusion as the threads *
* complete their work. *
* *
* CopyToChild() allocates a split block optimally (see *
* that code for an explanation) and then copies anything *
* necessary for the parallel search. *
* *
* Special case 1: In the loop to allocate a split block *
* and copy current data to the new child thread, we skip *
* over the current splitting thread's data. After the *
* loop we then explicitly allocate a block for this *
* thread so that it will be included in the thread group *
* (remember that this thread is the one that has to back *
* up through the split block, so it will always call *
* ThreadWait() telling it to do so). There was a peculiar *
* condition that caused an inefficiency. If we are at *
* a depth where we honor the smp_split_group limit, it is *
* possible that on occasion, there could be so many idle *
* processors that we would not include ourselves. For *
* example, suppose we are thread 6, and when we enter *
* Thread() to do the split, threads 0-5 are idle, and *
* smp_split_group = 6. We would pick up the first 6 idle *
* threads and sic them on this split point, but we would *
* fail to include ourselves. We would hit ThreadWait() *
* with no work to do, and then split somewhere else. *
* Unfortunately, THIS thread is the one that needs to *
* wait for all the other threads to complete so that it *
* can back up to the previous ply. But if it is busy on *
* another split point, it won't be able to return to this *
* point until it is idle, which means progress at this *
* split point is held up. We now forcefully include the *
* current thread, and only include smp_split_group - 1 *
* other threads to work here with this thread. *
* *
* Special case 2: it is possible that there are no split *
* blocks available. In that case, CopyToChild() will *
* terminate Crafty with a log-file error message telling *
* the user to re-compile with a larger number of split *
* blocks (the probability of this actually happening is *
* incredibly small, but it is non-zero. We use a VERY *
* liberal estimate for the number of split blocks to *
* allocate to avoid this.) *
* *
************************************************************
*/
thread[tree->thread_id].tree = 0;
tree->nprocs = 0;
for (proc = 0; proc < smp_max_threads; proc++) {
tree->siblings[proc] = 0;
if (thread[proc].idle) {
CopyToChild(tree, proc);
nblocks++;
if (nblocks >= smp_split_group && tree->ply > tree->depth / 2)
break;
}
}
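  /* special case 1 above: always include the thread doing the split */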
CopyToChild(tree, tree->thread_id);
nblocks++;
/*
************************************************************
* *
* Everything is set. Now we can stuff the address of the *
* thread blocks into thread[i].tree so that those idle *
* threads can begin the parallel search. We also flag *
* them as "not idle" so that a split immediately after we *
* exit won't try to pick them up again, breaking things. *
* *
************************************************************
*/
parallel_splits++;
for (proc = 0; proc < smp_max_threads; proc++)
if (tree->siblings[proc]) {
thread[proc].tree = tree->siblings[proc];
thread[proc].idle = 0;
}
/*
************************************************************
* *
* One final test before leaving. We test all threads to *
* determine if any are still idle (most likely due to the *
* smp_split_group limit, but a thread could go idle after *
* our loop to pick up idle threads). If so, we set smp_split *
* back to one (actually a temporary variable, tsplit, since *
* we don't want to remove the smp_split = -1 state until *
* just before we exit, to avoid any races). *
* *
************************************************************
*/
for (proc = 0; proc < smp_max_threads; proc++)
if (thread[proc].idle)
tsplit = 1;
/*
************************************************************
* *
* After the threads are kicked off to start the parallel *
* search using the idle threads, this thread has to be *
* inserted as well. However, since it is possible that *
* this thread may finish before any or all of the other *
* parallel threads, this thread is sent to ThreadWait() *
* which will immediately send it to SearchParallel() like *
* the other threads. Going to ThreadWait() allows this *
* thread to join others if it runs out of work to do. We *
* do pass ThreadWait() the address of the parent thread *
* block, so that if this thread becomes idle, and this *
* thread block shows no threads are still busy, then this *
* thread can return to here and then back up into the *
* previous ply as it should. Note that no other thread *
* can back up to the previous ply since their recursive *
* call stacks are not set for that. *
* *
************************************************************
*/
smp_split = tsplit;
Unlock(lock_smp);
ThreadWait(tree->thread_id, tree);
return 1;
}
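/*
   Illustrative sketch only (guarded by a macro that is never defined, so
   it does not enter the build): how a caller such as Search() might
   initiate a split per the description above. The real call site lives
   in Search(); the smp_split = -1 claim and its reset in Thread() follow
   the comments above, and smp_min_split_depth is the option described at
   the top of this file.
*/
#if defined(THREAD_EXAMPLES)
static int ExampleInitiateSplit(TREE * RESTRICT tree) {
  int did_split = 0;
  if (smp_split > 0 && tree->depth >= smp_min_split_depth) {
    Lock(lock_split);           /* rule 5: serialize split attempts */
    if (smp_split > 0) {        /* re-test now that we hold the lock */
      smp_split = -1;           /* claim the split; Thread() resets it */
      did_split = Thread(tree);
    }
    Unlock(lock_split);
  }
  return did_split;
}
#endif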
/* modified 04/30/14 */
/*
*******************************************************************************
* *
* CopyToChild() is used to copy data from a parent thread to a particular *
* child thread. This only copies the appropriate parts of the TREE *
* structure to avoid burning memory bandwidth by copying everything. *
* *
*******************************************************************************
*/
void CopyToChild(TREE * RESTRICT parent, int thread) {
  int i, j, ply = parent->ply;
unsigned int max_; // Pierre-Marie Baty -- should be unsigned
TREE *child;
static int warnings = 0;
int first = thread * MAX_BLOCKS_PER_CPU + 1;
int last = first + MAX_BLOCKS_PER_CPU;
int maxb = smp_max_threads * MAX_BLOCKS_PER_CPU + 1;
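  /* blocks first..last-1 were first touched by this thread (and so live
     in its local memory); 1..maxb-1 spans every thread's blocks and is
     used as the fallback search range below */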
/*
************************************************************
* *
* One NUMA-related trick is that we first try to allocate *
* a split block in the thread's local memory. Each *
* thread has a group of split blocks that were first *
* touched by the correct CPU so that the split blocks *
* page faulted into local memory for that specific *
* processor. If we can't find an optimally-placed block, *
* the second pass will find the first available block. *
* If none can be found, we return as we can not split *
* until other split blocks are freed up. *
* *
************************************************************
*/
for (i = first; i < last && block[i]->used; i++);
if (i >= last) {
if (++warnings < 6)
Print(128,
"WARNING. optimal SMP block cannot be allocated, thread %d\n",
thread);
for (i = 1; i < maxb && block[i]->used; i++);
    if (i >= maxb) {
      Print(128, "ERROR. no SMP block can be allocated\n");
      CraftyExit(1);  /* per the comment above: we cannot continue with no free split block */
    }
}
max_ = 0;
for (j = 1; j < maxb; j++)
if (block[j]->used)
max_++;
max_split_blocks = Max(max_split_blocks, max_);
/*
************************************************************
* *
* We have allocated a split block. Now we copy the tree *
* search state from the parent block to the child in *
* preparation for starting the parallel search. *
* *
************************************************************
*/
child = block[i];
child->used = 1;
child->stop = 0;
for (i = 0; i < smp_max_threads; i++)
child->siblings[i] = 0;
child->nprocs = 0;
child->ply = ply;
child->position = parent->position;
child->pv[ply - 1] = parent->pv[ply - 1];
child->pv[ply] = parent->pv[ply];
child->next_status[ply] = parent->next_status[ply];
child->save_hash_key[ply] = parent->save_hash_key[ply];
child->save_pawn_hash_key[ply] = parent->save_pawn_hash_key[ply];
child->rep_index = parent->rep_index;
for (i = 0; i <= parent->rep_index + parent->ply; i++)
child->rep_list[i] = parent->rep_list[i];
child->last[ply] = child->move_list;
child->hash_move[ply] = parent->hash_move[ply];
for (i = 1; i <= ply + 1; i++) {
child->status[i] = parent->status[i];
child->curmv[i] = parent->curmv[i];
child->inchk[i] = parent->inchk[i];
child->phase[i] = parent->phase[i];
}
for (i = 1; i < MAXPLY; i++)
child->killers[i] = parent->killers[i];
child->nodes_searched = 0;
child->fail_highs = 0;
child->fail_high_first_move = 0;
child->evaluations = 0;
child->egtb_probes = 0;
child->egtb_probes_successful = 0;
child->extensions_done = 0;
child->qchecks_done = 0;
child->reductions_done = 0;
child->moves_fpruned = 0;
child->alpha = parent->alpha;
child->beta = parent->beta;
child->value = parent->value;
child->side = parent->side;
child->depth = parent->depth;
parent->siblings[thread] = child;
child->thread_id = thread;
child->parent = parent;
parent->nprocs++;
strcpy_s(child->root_move_text, sizeof (child->root_move_text), parent->root_move_text); // Pierre-Marie Baty -- use safe version
strcpy_s(child->remaining_moves_text, sizeof (child->remaining_moves_text), parent->remaining_moves_text); // Pierre-Marie Baty -- use safe version
}
/* modified 04/18/14 */
/*
*******************************************************************************
* *
* CopyToParent() is used to copy data from a child thread to a parent *
* thread. This only copies the appropriate parts of the TREE structure to *
* avoid burning memory bandwidth by copying everything. *
* *
*******************************************************************************
*/
void CopyToParent(TREE * RESTRICT parent, TREE * RESTRICT child, int value) {
int ply = parent->ply;
/*
************************************************************
* *
* Only concern here is to make sure that the info is only *
* copied to the parent if our score is > than the parent *
* value, and that we were not stopped for any reason *
* which could produce a partial score that is worthless. *
* *
* In any case, we add our statistical counters to the *
* parent's totals no matter whether we finished or not *
* since the total nodes searched and such should consider *
* everything searched, not just the "useful stuff." *
* *
************************************************************
*/
if (child->nodes_searched && !child->stop && value > parent->value &&
!abort_search) {
parent->pv[ply] = child->pv[ply];
parent->value = value;
parent->cutmove = child->curmv[ply];
}
parent->nodes_searched += child->nodes_searched;
parent->fail_highs += child->fail_highs;
parent->fail_high_first_move += child->fail_high_first_move;
parent->evaluations += child->evaluations;
parent->egtb_probes += child->egtb_probes;
parent->egtb_probes_successful += child->egtb_probes_successful;
parent->extensions_done += child->extensions_done;
parent->qchecks_done += child->qchecks_done;
parent->reductions_done += child->reductions_done;
parent->moves_fpruned += child->moves_fpruned;
child->used = 0;
}
/*
*******************************************************************************
* *
* WaitForAllThreadsInitialized() waits until all smp_max_threads are *
* initialized. We have to initialize each thread and malloc() its split *
* blocks before we start the actual parallel search. Otherwise we will see *
* invalid memory accesses and crash instantly. *
* *
*******************************************************************************
*/
void WaitForAllThreadsInitialized(void) {
while (initialized_threads < smp_max_threads); /* Do nothing */
}
/* modified 06/07/09 */
/*
*******************************************************************************
* *
* ThreadInit() is called after a process is created. Its main task is to *
* initialize the process local memory so that it will fault in and be *
* allocated on the local node rather than the node where the original *
* (first) process was running. All threads will hang here via a custom *
* WaitForAllThreadsInitialized() procedure so that all the local thread *
* blocks are usable before the search actually begins. *
* *
*******************************************************************************
*/
void *STDCALL ThreadInit(void *tid) {
int i, j;
#if defined(AFFINITY)
int64_t k;
cpu_set_t cpuset;
pthread_t current_thread = pthread_self();
k = (int64_t) tid;
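  /* bind this thread to core 'tid' so that its split blocks and other
     first-touch allocations stay on the local NUMA node */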
CPU_ZERO(&cpuset);
CPU_SET(k, &cpuset);
pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
#endif
#if !defined(UNIX)
ThreadMalloc((uint64_t) tid);
#endif
  for (i = 0; i < MAX_BLOCKS_PER_CPU; i++) {
    uint64_t b = (uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1;
    memset((void *) block[b], 0, sizeof(TREE));
    block[b]->used = 0;
    block[b]->parent = NULL;
    LockInit(block[b]->lock);
    for (j = 0; j < 64; j++)
      block[b]->cache_n[j] = ~0ull;
  }
Lock(lock_smp);
initialized_threads++;
Unlock(lock_smp);
WaitForAllThreadsInitialized();
ThreadWait((uint64_t) tid, (TREE *) 0);
Lock(lock_smp);
smp_threads--;
Unlock(lock_smp);
return 0;
}
#if !defined (UNIX)
/* modified 01/17/09 */
/*
*******************************************************************************
* *
* ThreadMalloc() is called from the ThreadInit() function. It malloc's the *
* split blocks in the local memory for the processor associated with the *
* specific thread that is calling this code. *
* *
*******************************************************************************
*/
extern void *WinMalloc(size_t, int);
void ThreadMalloc(int64_t tid) {
int i, n = MAX_BLOCKS_PER_CPU;
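  /* the "(~127) & (ptr + 127)" arithmetic below rounds each allocation up
     to a 128-byte boundary so a split block starts on its own cache line */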
for (i = MAX_BLOCKS_PER_CPU * (int) (tid) + 1; n; i++, n--) { // Pierre-Marie Baty -- added type cast
if (block[i] == NULL)
block[i] =
(TREE *) ((~(size_t) 127) & (127 + (size_t) WinMalloc(sizeof(TREE) +
127, (int) tid))); // Pierre-Marie Baty -- added type cast
block[i]->used = 0;
block[i]->parent = NULL;
LockInit(block[i]->lock);
}
}
#endif
/* modified 01/17/09 */
/*
*******************************************************************************
* *
* ThreadStop() is called from SearchParallel() when it detects a beta *
* cutoff (fail high) at a node that is being searched in parallel. We need *
* to stop all threads here, and since this threading algorithm is recursive *
* it may be necessary to stop other threads that are helping search this *
* branch further down into the tree. This function simply sets tree->stop *
* to 1, which will stop that particular thread instantly and return it to *
* the idle loop in ThreadWait(). *
* *
*******************************************************************************
*/
void ThreadStop(TREE * RESTRICT tree) {
int proc;
Lock(tree->lock);
tree->stop = 1;
for (proc = 0; proc < smp_max_threads; proc++)
if (tree->siblings[proc])
ThreadStop(tree->siblings[proc]);
Unlock(tree->lock);
}
/* modified 04/23/14 */
/*
*******************************************************************************
* *
* ThreadWait() is the idle loop for the N threads that are created at the *
* beginning when Crafty searches. Threads are "parked" here waiting on a *
* pointer to something they should search (a parameter block built in the *
* function Thread() in this case). When this pointer becomes non-zero, each *
* thread "parked" here will immediately call SearchParallel() and begin the *
* parallel search as directed. *
* *
*******************************************************************************
*/
int ThreadWait(int64_t tid, TREE * RESTRICT waiting) {
int value, tstart, tend;
/*
************************************************************
* *
* The N initial threads enter here and are kept penned *
* here forever. However, once a thread leaves here it *
* may well re-enter ThreadWait() from the top while it *
* waits for a parallel search to complete. While it *
* waits here it can also join in to help other busy *
* threads search their subtrees as well. *
* *
************************************************************
*/
while (1) {
tstart = ReadClock();
Lock(lock_split);
Lock(lock_smp);
smp_idle++;
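    /* a thread idle at the top level (not waiting at a join point) turns splitting back on */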
if (!waiting && smp_split == 0)
smp_split = 1;
if (!thread[tid].tree)
thread[tid].idle = 1;
Unlock(lock_smp);
Unlock(lock_split);
/*
************************************************************
* *
* We can exit if our thread[tid].tree pointer is set, or if we are *
* waiting on others to finish a block that *we* have to *
* return through. When the busy count on such a block *
* hits zero, we return immediately which unwinds the *
* search as it should be. *
* *
************************************************************
*/
while (!thread[tid].tree && (!waiting || waiting->nprocs))
Pause();
thread[tid].idle = 0;
tend = ReadClock();
Lock(lock_smp);
idle_time += tend - tstart;
if (!thread[tid].tree)
thread[tid].tree = waiting;
/*
************************************************************
* *
* We either have work to do, or threads we were waiting *
* on have finished their work. *
* *
************************************************************
*/
smp_idle--;
Unlock(lock_smp);
/*
************************************************************
* *
* If we are waiting on a block and the busy count is now *
* zero, we simply return to finish up the bookkeeping at *
* that point. *
* *
************************************************************
*/
    if (thread[tid].tree == waiting || thread[tid].tree == (TREE *) -1)
return 0;
/*
************************************************************
* *
* Else our thread[tid].tree pointer is non-zero, meaning we have *
* been assigned something to search. Hi-ho, hi-ho, it's *
* off to work we go... *
* *
************************************************************
*/
value =
SearchParallel(thread[tid].tree, thread[tid].tree->alpha,
thread[tid].tree->beta, thread[tid].tree->value,
thread[tid].tree->side, thread[tid].tree->depth,
thread[tid].tree->ply);
Lock(thread[tid].tree->parent->lock);
CopyToParent((TREE *) thread[tid].tree->parent, thread[tid].tree, value);
thread[tid].tree->parent->nprocs--;
thread[tid].tree->parent->siblings[tid] = 0;
Unlock(thread[tid].tree->parent->lock);
thread[tid].tree = 0;
}
}