#include "chess.h"
 
#include "data.h"
 
#include "epdglue.h"
 
/* modified 04/30/14 */
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   Thread() is the driver for the threaded parallel search in Crafty.  The   *
 
 *   basic idea is that whenever we notice that one (or more) threads are in   *
 
 *   their idle loop, we drop into thread, from Search(), and begin a new      *
 
 *   parallel search from this node.  This is simply a problem of copying the  *
 
 *   search state space for each thread working at this node, then sending     *
 
 *   everyone off to SearchParallel() to search this node in parallel.         *
 
 *                                                                             *
 
 *   There are a number of settable options via the command-line or .craftyrc  *
 
 *   initialization file.  Here's a concise explanation for each option and an *
 
 *   occasional suggestion for testing/tuning.                                 *
 
 *                                                                             *
 
 *   smp_max_threads (command = smpmt=n) sets the total number of allowable    *
 
 *      threads for the search.  The default is one (1) as Crafty does not     *
 
 *      assume it should use all available resources.  For optimal performance *
 
 *      this should be set to the number of physical cores your machine has,   *
 
 *      which does NOT include hyperthreaded cores.                            *
 
 *                                                                             *
 
 *   smp_split_group (command = smpgroup=n) sets the maximum number of threads *
 
 *      at any single split point, except at split points where ply <= 4,      *
 
 *      where ALL threads are allowed to split together, ignoring this limit.  *
 
 *                                                                             *
 
 *   smp_min_split_depth (command = smpmin=n) avoids splitting when remaining  *
 
 *      depth < n.  This is used to balance splitting overhead cost against    *
 
 *      the speed gain the parallel search produces.  The default is currently *
 
 *      5 (which could change with future generations of Intel hardware) but   *
 
 *      values between 4 and 8 will work.  Larger values allow somewhat fewer  *
 
 *      splits, which reduces overhead, but it also increases the percentage   *
 
 *      of the time where a thread is waiting on work.                         *
 
 *                                                                             *
 
 *   smp_split_nodes (command = smpnodes=n) will not allow a thread to split   *
 
 *      until it has searched at least N nodes, to prevent excessive splitting *
 
 *      out near the tips in endgames.  The default is 2000.  Larger values    *
 
 *      will reduce splitting overhead but will increase the amount of time    *
 
 *      a thread is waiting on work.                                           *
 
 *                                                                             *
 
 *   smp_split_at_root (command=smproot=0 or 1) enables (1) or disables (0)    *
 
 *      splitting the tree at the root.  This defaults to 1 which produces the *
 
 *      best performance by a significant margin.  But it can be disabled if   *
 
 *      you are playing with code changes.                                     *
 
 *                                                                             *
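 
 *   As a concrete example (illustrative values only, not tuned               *
 
 *   recommendations), a .craftyrc for a machine with eight physical cores    *
 
 *   might contain:                                                            *
 
 *                                                                             *
 
 *      smpmt=8                                                                *
 
 *      smpgroup=6                                                             *
 
 *      smpmin=5                                                               *
 
 *      smpnodes=2000                                                          *
 
 *      smproot=1                                                              *
 
 *                                                                             *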
 
 *   The best way to tune any of these parameters is to run SEVERAL test cases *
 
 *   (positions) with max threads set.  Run each set of positions several      *
 
 *   times with each parameter change you want to try (do them ONE at a time   *
 
 *   for obvious reasons).  Pick the value with the smallest overall search    *
 
 *   time.  The more cores you use, the more times you should run each test,   *
 
 *   since parallel search is highly non-deterministic and you need several    *
 
 *   runs to get a reasonable average.                                         *
 
 *                                                                             *
 
 *   A few basic "rules of the road" for anyone interested in changing or      *
 
 *   adding to any of this code.                                               *
 
 *                                                                             *
 
 *   1.  If, at any time, you want to modify your private split block, no lock *
 
 *       is required.                                                          *
 
 *                                                                             *
 
 *   2.  If, at any time, you want to modify another split block, such as the  *
 
 *       parent split block's shared move list, you must acquire the lock in   *
 
 *       that split block first, e.g., tree->parent->lock to lock the parent   *
 
 *       split block during NextMove() and such (see the sketch after rule 6). *
 
 *       Note that this ONLY applies to search-related data, NOT the SMP data  *
 
 *       within the split block (parent pointer, sibling pointers, number of   *
 
 *       processors working here, etc).  Modifying those falls under the next  *
 
 *       lock below.                                                           *
 
 *                                                                             *
 
 *   3.  If you want to modify any SMP-related data, such as allocating a      *
 
 *       split block, or telling sibling threads to stop, etc., then you must  *
 
 *       acquire the global "smp_lock" lock first.  This prevents any sort of  *
 
 *       race condition that could corrupt the split block tree organization.  *
 
 *       This applies to ANY SMP-related variable from the simple smp_idle     *
 
 *       variable through the various sibling chains and such.                 *
 
 *                                                                             *
 
 *   4.  If you want to do any sort of I/O operation, you must acquire the     *
 
 *       "lock_io" lock first.  Since threads share descriptors, there are     *
 
 *       lots of potential race conditions, from the simple tty-output getting *
 
 *       interlaced from different threads, to outright data corruption in the *
 
 *       book or log files.                                                    *
 
 *                                                                             *
 
 *   5.  There is a specific "lock_split" that is used to prevent a lot of     *
 
 *       queued waiting to do splits, but it is unlikely that lock would ever  *
 
 *       be used anywhere other than where it is currently accessed in         *
 
 *       Search() at the parallel split initiation code.                       *
 
 *                                                                             *
 
 *   6.  If you want to alter the root move list, you must first acquire       *
 
 *       lock_root, since the root move list is shared and multiple threads    *
 
 *       can attempt to modify it at the same time.  Overlooking this can      *
 
 *       result in a corrupted root move list with one or more moves missing   *
 
 *       or duplicated.                                                        *
 
 *                                                                             *
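 
 *   A short sketch of rules 2 and 3 in code form (Lock(), Unlock(), lock_smp  *
 
 *   and smp_idle are the real names; the move-selection step is elided):      *
 
 *                                                                             *
 
 *      Lock(tree->parent->lock);            rule 2: search-related data       *
 
 *        (take the next move from the parent's shared move list)              *
 
 *      Unlock(tree->parent->lock);                                            *
 
 *                                                                             *
 
 *      Lock(lock_smp);                      rule 3: SMP-related state         *
 
 *      smp_idle--;                                                            *
 
 *      Unlock(lock_smp);                                                      *
 
 *                                                                             *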
 
 *   Some of the bugs caused by failing to acquire the correct lock will only  *
 
 *   occur infrequently, and they are extremely difficult to find.  Some only  *
 
 *   show up in a public game where everyone is watching, causing maximum      *
 
 *   embarrassment as the program does something extremely stupid.             *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
int Thread(TREE * RESTRICT tree) {
 
  int proc, tsplit = 0, nblocks = 0;
 
 
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  First, we make sure that there are idle threads. It is  *
 
 *  possible that we get here after they have all been      *
 
 *  claimed by another thread.  In that case we return.     *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  Lock(lock_smp);
 
  for (proc = 0; proc < smp_max_threads && !thread[proc].idle; proc++);
 
  if (proc == smp_max_threads) {
 
    smp_split = 0;
 
    Unlock(lock_smp);
 
    return 0;
 
  }
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  Now we start copying the current "state" from this      *
 
 *  thread into new thread blocks so that the threads can   *
 
 *  search this position without interfering with each      *
 
 *  other.  As we copy, we link those blocks as siblings of *
 
 *  the parent node, and point each sibling back to the     *
 
 *  parent so we can unwind this confusion as the threads   *
 
 *  complete their work.                                    *
 
 *                                                          *
 
 *  CopyToChild() allocates a split block optimally (see    *
 
 *  that code for an explanation) and then copies anything  *
 
 *  necessary for the parallel search.                      *
 
 *                                                          *
 
 *  Special case 1:  In the loop to allocate a split block  *
 
 *  and copy current data to the new child thread, we skip  *
 
 *  over the current splitting thread's data.  After the    *
 
 *  loop we then explicitly allocate a block for this       *
 
 *  thread so that it will be included in the thread group  *
 
 *  (remember that this thread is the one that has to back  *
 
 *  up through the split block, so it will always call      *
 
 *  ThreadWait() telling it to do so).  A peculiar          *
 
 *  condition once caused an inefficiency:  if we are at    *
 
 *  a depth where we honor the smp_split_group limit, it is *
 
 *  possible that on occasion, there could be so many idle  *
 
 *  processors that we would not include ourself.  For      *
 
 *  example, suppose we are thread 6, and when we enter     *
 
 *  Thread() to do the split, threads 0-5 are idle, and     *
 
 *  smp_split_group = 6.  We would pick up the first 6 idle *
 
 *  threads and sic them on this split point, but we would  *
 
 *  fail to include ourself.  We would hit ThreadWait()     *
 
 *  with no work to do, and then split somewhere else.      *
 
 *  Unfortunately, THIS thread is the one that needs to     *
 
 *  wait for all the other threads to complete so that it   *
 
 *  can back up to the previous ply.  But if it is busy on  *
 
 *  another split point, it won't be able to return to this *
 
 *  point until it is idle.  Which means progress at this   *
 
 *  split point is held up.  We now forcefully include the  *
 
 *  current thread, and only include smp_split_group - 1    *
 
 *  other threads to work here with this thread.            *
 
 *                                                          *
 
 *  Special case 2:  it is possible that there are no split *
 
 *  blocks available.  In that case, CopyToChild() will     *
 
 *  terminate Crafty with a log-file error message telling  *
 
 *  the user to re-compile with a larger number of split    *
 
 *  blocks (the probability of this actually happening is   *
 
 *  incredibly small, but it is non-zero.  We use a VERY    *
 
 *  liberal estimate for the number of split blocks to      *
 
 *  allocate to avoid this.)                                *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  thread[tree->thread_id].tree = 0;
 
  tree->nprocs = 0;
 
  for (proc = 0; proc < smp_max_threads; proc++) {
 
    tree->siblings[proc] = 0;
 
    if (thread[proc].idle) {
 
      CopyToChild(tree, proc);
 
      nblocks++;
 
      if (nblocks >= smp_split_group && tree->ply > tree->depth / 2)
 
        break;
 
    }
 
  }
 
  CopyToChild(tree, tree->thread_id);
 
  nblocks++;
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  Everything is set.  Now we can stuff the address of the *
 
 *  thread blocks into thread[i].tree so that those idle    *
 
 *  threads can begin the parallel search.  We also flag    *
 
 *  them as "not idle" so that a split immediately after we *
 
 *  exit won't try to pick them up again, breaking things.  *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  parallel_splits++;
 
  for (proc = 0; proc < smp_max_threads; proc++)
 
    if (tree->siblings[proc]) {
 
      thread[proc].tree = tree->siblings[proc];
 
      thread[proc].idle = 0;
 
    }
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  One final test before leaving.  We test all threads to  *
 
 *  determine if any are still idle (most likely due to the *
 
 *  smp_split_group limit, but a thread could go idle after *
 
 *  our loop to pick up idle threads).  If so we set split  *
 
 *  back to one (actually a temporary split variable since  *
 
 *  we don't want to remove the split=-1 state until just   *
 
 *  before we exit to avoid any races.)                     *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  for (proc = 0; proc < smp_max_threads; proc++)
 
    if (thread[proc].idle)
 
      tsplit = 1;
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  After the threads are kicked off to start the parallel  *
 
 *  search using the idle threads, this thread has to be    *
 
 *  inserted as well.  However, since it is possible that   *
 
 *  this thread may finish before any or all of the other   *
 
 *  parallel threads, this thread is sent to ThreadWait()   *
 
 *  which will immediately send it to SearchParallel() like *
 
 *  the other threads.  Going to ThreadWait() allows this   *
 
 *  thread to join others if it runs out of work to do.  We *
 
 *  do pass ThreadWait() the address of the parent thread   *
 
 *  block, so that if this thread becomes idle, and this    *
 
 *  thread block shows no threads are still busy, then this *
 
 *  thread can return to here and then back up into the     *
 
 *  previous ply as it should.  Note that no other thread   *
 
 *  can back up to the previous ply since their recursive   *
 
 *  call stacks are not set for that.                       *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  smp_split = tsplit;
 
  Unlock(lock_smp);
 
  ThreadWait(tree->thread_id, tree);
 
  return 1;
 
}
 
 
 
/* modified 04/30/14 */
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   CopyToChild() is used to copy data from a parent thread to a particular   *
 
 *   child thread.  This only copies the appropriate parts of the TREE         *
 
 *   structure to avoid burning memory bandwidth by copying everything.        *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
void CopyToChild(TREE * RESTRICT parent, int thread) {
 
  int i, j, ply = parent->ply;
 
  unsigned int max_; // Pierre-Marie Baty -- should be unsigned
 
  TREE *child;
 
  static int warnings = 0;
 
  int first = thread * MAX_BLOCKS_PER_CPU + 1;
 
  int last = first + MAX_BLOCKS_PER_CPU;
 
  int maxb = smp_max_threads * MAX_BLOCKS_PER_CPU + 1;
 
 
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  One NUMA-related trick is that we first try to allocate *
 
 *  a split block in the thread's local memory.  Each       *
 
 *  thread has a group of split blocks that were first      *
 
 *  touched by the correct CPU so that the split blocks     *
 
 *  page faulted into local memory for that specific        *
 
 *  processor.  If we can't find an optimal-placed block,   *
 
 *  the second pass will find the first available block.    *
 
 *  If none can be found, we return as we can not split     *
 
 *  until other split blocks are freed up.                  *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  for (i = first; i < last && block[i]->used; i++);    /* pass 1: this thread's NUMA-local blocks */
 
  if (i >= last) {
 
    if (++warnings < 6)
 
      Print(128,
 
          "WARNING.  optimal SMP block cannot be allocated, thread %d\n",
 
          thread);
 
    for (i = 1; i < maxb && block[i]->used; i++);      /* pass 2: any free block */
 
    if (i >= maxb) {
 
      Print(128, "ERROR.  no SMP block can be allocated\n");
 
      CraftyExit(1);    /* no split block available; terminate as described above */
 
    }
 
  }
 
  max_ = 0;
 
  for (j = 1; j < maxb; j++)
 
    if (block[j]->used)
 
      max_++;
 
  max_split_blocks = Max(max_split_blocks, max_);
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  We have allocated a split block.  Now we copy the tree  *
 
 *  search state from the parent block to the child in      *
 
 *  preparation for starting the parallel search.           *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  child = block[i];
 
  child->used = 1;
 
  child->stop = 0;
 
  for (i = 0; i < smp_max_threads; i++)
 
    child->siblings[i] = 0;
 
  child->nprocs = 0;
 
  child->ply = ply;
 
  child->position = parent->position;
 
  child->pv[ply - 1] = parent->pv[ply - 1];
 
  child->pv[ply] = parent->pv[ply];
 
  child->next_status[ply] = parent->next_status[ply];
 
  child->save_hash_key[ply] = parent->save_hash_key[ply];
 
  child->save_pawn_hash_key[ply] = parent->save_pawn_hash_key[ply];
 
  child->rep_index = parent->rep_index;
 
  for (i = 0; i <= parent->rep_index + parent->ply; i++)
 
    child->rep_list[i] = parent->rep_list[i];
 
  child->last[ply] = child->move_list;
 
  child->hash_move[ply] = parent->hash_move[ply];
 
  for (i = 1; i <= ply + 1; i++) {
 
    child->status[i] = parent->status[i];
 
    child->curmv[i] = parent->curmv[i];
 
    child->inchk[i] = parent->inchk[i];
 
    child->phase[i] = parent->phase[i];
 
  }
 
  for (i = 1; i < MAXPLY; i++)
 
    child->killers[i] = parent->killers[i];
 
  child->nodes_searched = 0;
 
  child->fail_highs = 0;
 
  child->fail_high_first_move = 0;
 
  child->evaluations = 0;
 
  child->egtb_probes = 0;
 
  child->egtb_probes_successful = 0;
 
  child->extensions_done = 0;
 
  child->qchecks_done = 0;
 
  child->reductions_done = 0;
 
  child->moves_fpruned = 0;
 
  child->alpha = parent->alpha;
 
  child->beta = parent->beta;
 
  child->value = parent->value;
 
  child->side = parent->side;
 
  child->depth = parent->depth;
 
  parent->siblings[thread] = child;
 
  child->thread_id = thread;
 
  child->parent = parent;
 
  parent->nprocs++;
 
  strcpy_s(child->root_move_text, sizeof (child->root_move_text), parent->root_move_text); // Pierre-Marie Baty -- use safe version
 
  strcpy_s(child->remaining_moves_text, sizeof (child->remaining_moves_text), parent->remaining_moves_text); // Pierre-Marie Baty -- use safe version
 
}
 
 
 
/* modified 04/18/14 */
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   CopyToParent() is used to copy data from a child thread to a parent       *
 
 *   thread.  This only copies the appropriate parts of the TREE structure to  *
 
 *   avoid burning memory bandwidth by copying everything.                     *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
void CopyToParent(TREE * RESTRICT parent, TREE * RESTRICT child, int value) {
 
  int ply = parent->ply;
 
 
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  Only concern here is to make sure that the info is only *
 
 *  copied to the parent if our score is > than the parent  *
 
 *  value, and that we were not stopped for any reason      *
 
 *  which could produce a partial score that is worthless.  *
 
 *                                                          *
 
 *  In any case, we add our statistical counters to the     *
 
 *  parent's totals no matter whether we finished or not    *
 
 *  since the total nodes searched and such should consider *
 
 *  everything searched, not just the "useful stuff."       *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  if (child->nodes_searched && !child->stop && value > parent->value &&
 
      !abort_search) {
 
    parent->pv[ply] = child->pv[ply];
 
    parent->value = value;
 
    parent->cutmove = child->curmv[ply];
 
  }
 
  parent->nodes_searched += child->nodes_searched;
 
  parent->fail_highs += child->fail_highs;
 
  parent->fail_high_first_move += child->fail_high_first_move;
 
  parent->evaluations += child->evaluations;
 
  parent->egtb_probes += child->egtb_probes;
 
  parent->egtb_probes_successful += child->egtb_probes_successful;
 
  parent->extensions_done += child->extensions_done;
 
  parent->qchecks_done += child->qchecks_done;
 
  parent->reductions_done += child->reductions_done;
 
  parent->moves_fpruned += child->moves_fpruned;
 
  child->used = 0;
 
}
 
 
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   WaitForAllThreadsInitialized() waits until all smp_max_threads are        *
 
 *   initialized.  We have to initialize each thread and malloc() its split    *
 
 *   blocks before we start the actual parallel search.  Otherwise we will see *
 
 *   invalid memory accesses and crash instantly.                              *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
void WaitForAllThreadsInitialized(void) {
 
  while (initialized_threads < smp_max_threads);        /* Do nothing */
 
}
 
 
 
/* modified 06/07/09 */
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   ThreadInit() is called after a process is created.  Its main task is to   *
 
 *   initialize the process local memory so that it will fault in and be       *
 
 *   allocated on the local node rather than the node where the original       *
 
 *   (first) process was running.  All threads will hang here via a custom     *
 
 *   WaitForAllThreadsInitialized() procedure so that all the local thread     *
 
 *   blocks are usable before the search actually begins.                      *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
void *STDCALL ThreadInit(void *tid) {
 
  int i, j;
 
#if defined(AFFINITY)
 
  int64_t k;
 
  cpu_set_t cpuset;
 
  pthread_t current_thread = pthread_self();
 
 
 
  k = (int64_t) tid;
 
  CPU_ZERO(&cpuset);
 
  CPU_SET(k, &cpuset);
 
  pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
 
#endif
 
#if !defined(UNIX)
 
  ThreadMalloc((uint64_t) tid);
 
#endif
 
  for (i = 0; i < MAX_BLOCKS_PER_CPU; i++) {
 
    /* first touch: zeroing each block here faults it into this thread's
       node-local memory (see the ThreadInit() header comment above) */
    memset((void *) block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1], 0,
        sizeof(TREE));
 
    block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->used = 0;
 
    block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->parent = NULL;
 
    LockInit(block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->lock);
 
    for (j = 0; j < 64; j++)
 
      block[(uint64_t) tid * MAX_BLOCKS_PER_CPU + i + 1]->cache_n[j] = ~0ull;
 
  }
 
  Lock(lock_smp);
 
  initialized_threads++;
 
  Unlock(lock_smp);
 
  WaitForAllThreadsInitialized();
 
  ThreadWait((uint64_t) tid, (TREE *) 0);
 
  Lock(lock_smp);
 
  smp_threads--;
 
  Unlock(lock_smp);
 
  return 0;
 
}
 
 
 
#if !defined (UNIX)
 
/* modified 01/17/09 */
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   ThreadMalloc() is called from the ThreadInit() function.  It malloc's the *
 
 *   split blocks in the local memory for the processor associated with the    *
 
 *   specific thread that is calling this code.                                *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
extern void *WinMalloc(size_t, int);
 
void ThreadMalloc(int64_t tid) {
 
  int i, n = MAX_BLOCKS_PER_CPU;
 
 
 
  for (i = MAX_BLOCKS_PER_CPU * (int) (tid) + 1; n; i++, n--) { // Pierre-Marie Baty -- added type cast
 
    /* over-allocate by 127 bytes, then round up to the next 128-byte
       boundary so each TREE starts on a cache-line-aligned address */
 
    if (block[i] == NULL)
 
      block[i] =
 
          (TREE *) ((~(size_t) 127) & (127 + (size_t) WinMalloc(sizeof(TREE) +
 
                  127, (int) tid))); // Pierre-Marie Baty -- added type cast
 
    block[i]->used = 0;
 
    block[i]->parent = NULL;
 
    LockInit(block[i]->lock);
 
  }
 
}
 
#endif
 
 
 
/* modified 01/17/09 */
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   ThreadStop() is called from SearchParallel() when it detects a beta       *
 
 *   cutoff (fail high) at a node that is being searched in parallel.  We need *
 
 *   to stop all threads here, and since this threading algorithm is recursive *
 
 *   it may be necessary to stop other threads that are helping search this    *
 
 *   branch further down into the tree.  This function simply sets tree->stop  *
 
 *   to 1, which will stop that particular thread instantly and return it to   *
 
 *   the idle loop in ThreadWait().                                            *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
void ThreadStop(TREE * RESTRICT tree) {
 
  int proc;
 
 
 
  Lock(tree->lock);
 
  tree->stop = 1;
 
  for (proc = 0; proc < smp_max_threads; proc++)
 
    if (tree->siblings[proc])
 
      ThreadStop(tree->siblings[proc]);
 
  Unlock(tree->lock);
 
}
 
 
 
/* modified 04/23/14 */
 
/*
 
 *******************************************************************************
 
 *                                                                             *
 
 *   ThreadWait() is the idle loop for the N threads that are created at the   *
 
 *   beginning when Crafty searches.  Threads are "parked" here waiting on a   *
 
 *   pointer to something they should search (a parameter block built in the   *
 
 *   function Thread() in this case). When this pointer becomes non-zero, each *
 
 *   thread "parked" here will immediately call SearchParallel() and begin the *
 
 *   parallel search as directed.                                              *
 
 *                                                                             *
 
 *******************************************************************************
 
 */
 
int ThreadWait(int64_t tid, TREE * RESTRICT waiting) {
 
  int value, tstart, tend;
 
 
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  The N initial threads enter here and are kept penned    *
 
 *  here forever.  However, once a thread leaves here it    *
 
 *  may well re-enter ThreadWait() from the top while it    *
 
 *  waits for a parallel search to complete.  While it      *
 
 *  waits here it can also join in to help other busy       *
 
 *  threads search their subtrees as well.                  *
 
 *                                                          *
 
 ************************************************************
 
 */
 
  while (1) {
 
    tstart = ReadClock();
 
    Lock(lock_split);
 
    Lock(lock_smp);
 
    smp_idle++;
 
    if (!waiting && smp_split == 0)
 
      smp_split = 1;
 
    if (!thread[tid].tree)
 
      thread[tid].idle = 1;
 
    Unlock(lock_smp);
 
    Unlock(lock_split);
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  We can exit if our thread[i] is non-zero, or if we are  *
 
 *  waiting on others to finish a block that *we* have to   *
 
 *  return through.  When the busy count on such a block    *
 
 *  hits zero, we return immediately which unwinds the      *
 
 *  search as it should be.                                 *
 
 *                                                          *
 
 ************************************************************
 
 */
 
    while (!thread[tid].tree && (!waiting || waiting->nprocs))
 
      Pause();
 
    thread[tid].idle = 0;
 
    tend = ReadClock();
 
    Lock(lock_smp);
 
    idle_time += tend - tstart;
 
    if (!thread[tid].tree)
 
      thread[tid].tree = waiting;
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  We either have work to do, or threads we were waiting   *
 
 *  on have finished their work.                            *
 
 *                                                          *
 
 ************************************************************
 
 */
 
    smp_idle--;
 
    Unlock(lock_smp);
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  If we are waiting on a block and the busy count is now  *
 
 *  zero, we simply return to finish up the bookkeeping at  *
 
 *  that point.                                             *
 
 *                                                          *
 
 ************************************************************
 
 */
 
    if (thread[tid].tree == waiting || thread[tid].tree == (TREE *) - 1)
 
      return 0;
 
/*
 
 ************************************************************
 
 *                                                          *
 
 *  Else our thread[i] pointer is non-zero, meaning we have *
 
 *  been assigned something to search.  Hi-ho, hi-ho, it's  *
 
 *  off to work we go...                                    *
 
 *                                                          *
 
 ************************************************************
 
 */
 
    value =
 
        SearchParallel(thread[tid].tree, thread[tid].tree->alpha,
 
        thread[tid].tree->beta, thread[tid].tree->value,
 
        thread[tid].tree->side, thread[tid].tree->depth,
 
        thread[tid].tree->ply);
 
    Lock(thread[tid].tree->parent->lock);
 
    CopyToParent((TREE *) thread[tid].tree->parent, thread[tid].tree, value);
 
    thread[tid].tree->parent->nprocs--;
 
    thread[tid].tree->parent->siblings[tid] = 0;
 
    Unlock(thread[tid].tree->parent->lock);
 
    thread[tid].tree = 0;
 
  }
 
}