function SearchIndex::index

Same name and namespace in other branches
  1. 9 core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex::index()
  2. 8.9.x core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex::index()
  3. 11.x core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex::index()

Updates the full-text search index for a particular item.

Parameters

string $type: The plugin ID or other machine-readable type of this item, which should be less than 64 bytes.

int $sid: An ID number identifying this particular item (e.g., node ID).

string $langcode: Language code for the language of the text being indexed.

string $text: The content of this item. Must be a piece of HTML or plain text.

bool $update_weights: (optional) TRUE if word weights should be updated. FALSE otherwise; defaults to TRUE. If you pass in FALSE, then you need to have your calls to this method in a try/finally block, and at the end of your index run in the finally clause, you will need to call self::updateWordWeights(), passing in all of the returned words, to update the word weights.

Return value

string[] The words to be updated.

Overrides SearchIndexInterface::index

File

core/modules/search/src/SearchIndex.php, line 89

Class

SearchIndex
Provides search index management functions.

Namespace

Drupal\search

Code

/**
 * Updates the full-text search index for a particular item.
 *
 * The text is reduced to the HTML tags that have a configured weight, split
 * into alternating text/tag pieces, and every processed word is given a score
 * based on the tags it is nested in. The previous index entries for the item
 * are cleared before the new dataset and word scores are written.
 *
 * @param string $type
 *   The plugin ID or other machine-readable type of this item.
 * @param int $sid
 *   An ID number identifying this particular item (e.g., node ID).
 * @param string $langcode
 *   Language code for the language of the text being indexed.
 * @param string $text
 *   The content of this item: a piece of HTML or plain text.
 * @param bool $update_weights
 *   (optional) TRUE if word weights should be updated by this call. Defaults
 *   to TRUE. When FALSE, the caller is expected to pass the returned words to
 *   updateWordWeights() itself.
 *
 * @return array
 *   The words that were written to the index, as array keys (values are TRUE).
 *
 * @throws SearchIndexException
 *   If writing the dataset or the word scores fails; the original exception
 *   is attached as the previous exception.
 */
public function index($type, $sid, $langcode, $text, $update_weights = TRUE) {
  $settings = $this->configFactory
    ->get('search.settings');
  $minimum_word_size = $settings->get('index.minimum_word_size');
  // Keep track of the words that need to have their weights updated.
  $current_words = [];
  // Multipliers for scores of words inside certain HTML tags. The weights are
  // stored in config so that modules can overwrite the default weights.
  // Note: 'a' must be included for link ranking to work.
  $tags = $settings->get('index.tag_weights');
  // Strip off all ignored tags to speed up processing, but insert space
  // before and after them to keep word boundaries.
  $text = str_replace([
    '<',
    '>',
  ], [
    ' <',
    '> ',
  ], $text);
  $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');
  // Split HTML tags from plain text.
  $split = preg_split('/\\s*<([^>]+?)>\\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: with PREG_SPLIT_DELIM_CAPTURE the array consists of alternating
  // literals and captured delimiters, and begins and ends with a literal
  // (empty strings are inserted as required).
  // Odd/even counter. Tag or no tag.
  $tag = FALSE;
  // Starting score per word.
  $score = 1;
  // Accumulator for cleaned up data.
  $accumulator = ' ';
  // Stack with open tags.
  $tag_stack = [];
  // Counter for consecutive words.
  $tag_words = 0;
  // Focus state.
  $focus = 1;
  // Accumulator for words for index.
  $scored_words = [];
  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag.
      [
        $tagname,
      ] = explode(' ', $value, 2);
      $tagname = mb_strtolower($tagname);
      // Closing or opening tag? If the captured tag text starts with a space,
      // explode() yields an empty name, so the first character must be read
      // defensively to avoid an "uninitialized string offset" warning.
      if (($tagname[0] ?? '') === '/') {
        $tagname = substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect
        // boosting.
        if (!count($tag_stack) || $tag_stack[0] !== $tagname) {
          $tag_stack = [];
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score. A stacked name without
          // a configured weight contributes nothing.
          $score = max(1, $score - ($tags[array_shift($tag_stack)] ?? 0));
        }
      }
      else {
        if (isset($tag_stack[0]) && $tag_stack[0] === $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tag_stack = [];
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score. Names without a
          // configured weight (including an empty name) add no boost instead
          // of triggering an "undefined array key" warning.
          array_unshift($tag_stack, $tagname);
          $score += $tags[$tagname] ?? 0;
        }
      }
      // A tag change occurred, reset counter.
      $tag_words = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty
      // values.
      if ($value != '') {
        $words = $this->textProcessor
          ->process($value, $langcode);
        foreach ($words as $word) {
          // Add word to accumulator.
          $accumulator .= $word . ' ';
          // Check word length.
          if (is_numeric($word) || mb_strlen($word) >= $minimum_word_size) {
            if (!isset($scored_words[$word])) {
              $scored_words[$word] = 0;
            }
            $scored_words[$word] += $score * $focus;
            // Focus is a decaying value in terms of the amount of unique
            // words up to this point. From 100 words and more, it decays, to
            // e.g. 0.5 at 500 words and 0.3 at 1000 words.
            $focus = min(1, 0.01 + 3.5 / (2 + count($scored_words) * 0.015));
          }
          $tag_words++;
          // Too many words inside a single tag probably mean a tag was
          // accidentally left open.
          if (count($tag_stack) && $tag_words >= 15) {
            $tag_stack = [];
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }
  // Remove the item $sid from the search index, and invalidate the relevant
  // cache tags.
  $this->clear($type, $sid, $langcode);
  try {
    // Insert cleaned up data into dataset.
    $this->connection
      ->insert('search_dataset')
      ->fields([
      'sid' => $sid,
      'langcode' => $langcode,
      'type' => $type,
      'data' => $accumulator,
      'reindex' => 0,
    ])
      ->execute();
    // Insert results into search index. A separate loop variable is used so
    // the running per-word $score from the parsing phase is not reused.
    foreach ($scored_words as $word => $word_score) {
      // If a word already exists in the database, its score gets increased
      // appropriately. If not, we create a new record with the appropriate
      // starting score.
      $this->connection
        ->merge('search_index')
        ->keys([
        'word' => $word,
        'sid' => $sid,
        'langcode' => $langcode,
        'type' => $type,
      ])
        ->fields([
        'score' => $word_score,
      ])
        ->expression('score', '[score] + :score', [
        ':score' => $word_score,
      ])
        ->execute();
      $current_words[$word] = TRUE;
    }
  } catch (\Exception $e) {
    throw new SearchIndexException("Failed to insert dataset in index for type '{$type}', sid '{$sid}' and langcode '{$langcode}'", 0, $e);
  } finally {
    // Runs on success and on failure, so words written before an error still
    // get their weights refreshed.
    if ($update_weights) {
      $this->updateWordWeights($current_words);
    }
  }
  return $current_words;
}

Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.