class SearchIndex

Same name and namespace in other branches
  1. 9 core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex
  2. 8.9.x core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex
  3. 11.x core/modules/search/src/SearchIndex.php \Drupal\search\SearchIndex

Provides search index management functions.

Hierarchy

Expanded class hierarchy of SearchIndex

1 string reference to 'SearchIndex'
search.services.yml in core/modules/search/search.services.yml
core/modules/search/search.services.yml
1 service uses SearchIndex
search.index in core/modules/search/search.services.yml
Drupal\search\SearchIndex

File

core/modules/search/src/SearchIndex.php, line 14

Namespace

Drupal\search
View source
class SearchIndex implements SearchIndexInterface {
  
  /**
   * The config factory.
   *
   * @var \Drupal\Core\Config\ConfigFactoryInterface
   */
  protected $configFactory;
  
  /**
   * The database connection.
   *
   * @var \Drupal\Core\Database\Connection
   */
  protected $connection;
  
  /**
   * The database replica connection.
   *
   * @var \Drupal\Core\Database\Connection
   */
  protected $replica;
  
  /**
   * The cache tags invalidator.
   *
   * @var \Drupal\Core\Cache\CacheTagsInvalidatorInterface
   */
  protected $cacheTagsInvalidator;
  
  /**
   * The text processor.
   *
   * @var \Drupal\search\SearchTextProcessorInterface
   */
  protected $textProcessor;
  
  /**
   * SearchIndex constructor.
   *
   * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
   *   The config factory.
   * @param \Drupal\Core\Database\Connection $connection
   *   The database connection.
   * @param \Drupal\Core\Database\Connection $replica
   *   The database replica connection.
   * @param \Drupal\Core\Cache\CacheTagsInvalidatorInterface $cache_tags_invalidator
   *   The cache tags invalidator.
   * @param \Drupal\search\SearchTextProcessorInterface $text_processor
   *   The text processor.
   * @param \Drupal\Component\Datetime\TimeInterface|null $time
   *   The time service
   */
  public function __construct(ConfigFactoryInterface $config_factory, Connection $connection, Connection $replica, CacheTagsInvalidatorInterface $cache_tags_invalidator, SearchTextProcessorInterface $text_processor, protected ?TimeInterface $time = NULL) {
    $this->configFactory = $config_factory;
    $this->connection = $connection;
    $this->replica = $replica;
    $this->cacheTagsInvalidator = $cache_tags_invalidator;
    $this->textProcessor = $text_processor;
    if (!$time) {
      @trigger_error('Calling ' . __METHOD__ . '() without the $time argument is deprecated in drupal:10.3.0 and it will be required in drupal:11.0.0. See https://www.drupal.org/node/3387233', E_USER_DEPRECATED);
      $this->time = \Drupal::service(TimeInterface::class);
    }
  }
  
  /**
   * {@inheritdoc}
   */
  public function index($type, $sid, $langcode, $text, $update_weights = TRUE) {
    $settings = $this->configFactory
      ->get('search.settings');
    $minimum_word_size = $settings->get('index.minimum_word_size');
    // Keep track of the words that need to have their weights updated.
    $current_words = [];
    // Multipliers for scores of words inside certain HTML tags. The weights are
    // stored in config so that modules can overwrite the default weights.
    // Note: 'a' must be included for link ranking to work.
    $tags = $settings->get('index.tag_weights');
    // Strip off all ignored tags to speed up processing, but insert space
    // before and after them to keep word boundaries.
    $text = str_replace([
      '<',
      '>',
    ], [
      ' <',
      '> ',
    ], $text);
    $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');
    // Split HTML tags from plain text.
    $split = preg_split('/\\s*<([^>]+?)>\\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
    // Note: PHP ensures the array consists of alternating delimiters and
    // literals and begins and ends with a literal (inserting $null as
    // required).
    // Odd/even counter. Tag or no tag.
    $tag = FALSE;
    // Starting score per word.
    $score = 1;
    // Accumulator for cleaned up data.
    $accumulator = ' ';
    // Stack with open tags.
    $tag_stack = [];
    // Counter for consecutive words.
    $tag_words = 0;
    // Focus state.
    $focus = 1;
    // Accumulator for words for index.
    $scored_words = [];
    foreach ($split as $value) {
      if ($tag) {
        // Increase or decrease score per word based on tag.
        [
          $tagname,
        ] = explode(' ', $value, 2);
        $tagname = mb_strtolower($tagname);
        // Closing or opening tag?
        if ($tagname[0] == '/') {
          $tagname = substr($tagname, 1);
          // If we encounter unexpected tags, reset score to avoid incorrect
          // boosting.
          if (!count($tag_stack) || $tag_stack[0] != $tagname) {
            $tag_stack = [];
            $score = 1;
          }
          else {
            // Remove from tag stack and decrement score.
            $score = max(1, $score - $tags[array_shift($tag_stack)]);
          }
        }
        else {
          if (isset($tag_stack[0]) && $tag_stack[0] == $tagname) {
            // None of the tags we look for make sense when nested identically.
            // If they are, it's probably broken HTML.
            $tag_stack = [];
            $score = 1;
          }
          else {
            // Add to open tag stack and increment score.
            array_unshift($tag_stack, $tagname);
            $score += $tags[$tagname];
          }
        }
        // A tag change occurred, reset counter.
        $tag_words = 0;
      }
      else {
        // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty
        // values.
        if ($value != '') {
          $words = $this->textProcessor
            ->process($value, $langcode);
          foreach ($words as $word) {
            // Add word to accumulator.
            $accumulator .= $word . ' ';
            // Check word length.
            if (is_numeric($word) || mb_strlen($word) >= $minimum_word_size) {
              if (!isset($scored_words[$word])) {
                $scored_words[$word] = 0;
              }
              $scored_words[$word] += $score * $focus;
              // Focus is a decaying value in terms of the amount of unique
              // words up to this point. From 100 words and more, it decays, to
              // e.g. 0.5 at 500 words and 0.3 at 1000 words.
              $focus = min(1, 0.01 + 3.5 / (2 + count($scored_words) * 0.015));
            }
            $tag_words++;
            // Too many words inside a single tag probably mean a tag was
            // accidentally left open.
            if (count($tag_stack) && $tag_words >= 15) {
              $tag_stack = [];
              $score = 1;
            }
          }
        }
      }
      $tag = !$tag;
    }
    // Remove the item $sid from the search index, and invalidate the relevant
    // cache tags.
    $this->clear($type, $sid, $langcode);
    try {
      // Insert cleaned up data into dataset.
      $this->connection
        ->insert('search_dataset')
        ->fields([
        'sid' => $sid,
        'langcode' => $langcode,
        'type' => $type,
        'data' => $accumulator,
        'reindex' => 0,
      ])
        ->execute();
      // Insert results into search index.
      foreach ($scored_words as $word => $score) {
        // If a word already exists in the database, its score gets increased
        // appropriately. If not, we create a new record with the appropriate
        // starting score.
        $this->connection
          ->merge('search_index')
          ->keys([
          'word' => $word,
          'sid' => $sid,
          'langcode' => $langcode,
          'type' => $type,
        ])
          ->fields([
          'score' => $score,
        ])
          ->expression('score', '[score] + :score', [
          ':score' => $score,
        ])
          ->execute();
        $current_words[$word] = TRUE;
      }
    } catch (\Exception $e) {
      throw new SearchIndexException("Failed to insert dataset in index for type '{$type}', sid '{$sid}' and langcode '{$langcode}'", 0, $e);
    } finally {
      if ($update_weights) {
        $this->updateWordWeights($current_words);
      }
    }
    return $current_words;
  }
  
  /**
   * {@inheritdoc}
   */
  public function clear($type = NULL, $sid = NULL, $langcode = NULL) {
    try {
      $query_index = $this->connection
        ->delete('search_index');
      $query_dataset = $this->connection
        ->delete('search_dataset');
      if ($type) {
        $query_index->condition('type', $type);
        $query_dataset->condition('type', $type);
        if ($sid) {
          $query_index->condition('sid', $sid);
          $query_dataset->condition('sid', $sid);
          if ($langcode) {
            $query_index->condition('langcode', $langcode);
            $query_dataset->condition('langcode', $langcode);
          }
        }
      }
      $query_index->execute();
      $query_dataset->execute();
    } catch (\Exception $e) {
      throw new SearchIndexException("Failed to clear index for type '{$type}', sid '{$sid}' and langcode '{$langcode}'", 0, $e);
    }
    if ($type) {
      // Invalidate all render cache items that contain data from this index.
      $this->cacheTagsInvalidator
        ->invalidateTags([
        'search_index:' . $type,
      ]);
    }
    else {
      // Invalidate all render cache items that contain data from any index.
      $this->cacheTagsInvalidator
        ->invalidateTags([
        'search_index',
      ]);
    }
  }
  
  /**
   * {@inheritdoc}
   */
  public function markForReindex($type = NULL, $sid = NULL, $langcode = NULL) {
    try {
      $query = $this->connection
        ->update('search_dataset')
        ->fields([
        'reindex' => $this->time
          ->getRequestTime(),
      ])
        ->condition('reindex', 0);
      if ($type) {
        $query->condition('type', $type);
        if ($sid) {
          $query->condition('sid', $sid);
          if ($langcode) {
            $query->condition('langcode', $langcode);
          }
        }
      }
      $query->execute();
    } catch (\Exception $e) {
      throw new SearchIndexException("Failed to mark index for re-indexing for type '{$type}', sid '{$sid}' and langcode '{$langcode}'", 0, $e);
    }
  }
  
  /**
   * {@inheritdoc}
   */
  public function updateWordWeights(array $words) {
    try {
      // Update word IDF (Inverse Document Frequency) counts for new/changed
      // words.
      $words = array_keys($words);
      foreach ($words as $word) {
        // Get total count.
        $total = $this->replica
          ->query("SELECT SUM([score]) FROM {search_index} WHERE [word] = :word", [
          ':word' => $word,
        ])
          ->fetchField();
        // Apply Zipf's law to equalize the probability distribution.
        $total = log10(1 + 1 / max(1, $total));
        $this->connection
          ->merge('search_total')
          ->key('word', $word)
          ->fields([
          'count' => $total,
        ])
          ->execute();
      }
      // Find words that were deleted from search_index, but are still in
      // search_total. We use a LEFT JOIN between the two tables and keep only
      // the rows which fail to join.
      $result = $this->replica
        ->query("SELECT [t].[word] AS [realword], [i].[word] FROM {search_total} [t] LEFT JOIN {search_index} [i] ON [t].[word] = [i].[word] WHERE [i].[word] IS NULL");
      $or = $this->replica
        ->condition('OR');
      foreach ($result as $word) {
        $or->condition('word', $word->realword);
      }
      if (count($or) > 0) {
        $this->connection
          ->delete('search_total')
          ->condition($or)
          ->execute();
      }
    } catch (\Exception $e) {
      throw new SearchIndexException("Failed to update totals for index words.", 0, $e);
    }
  }

}

Members

Title Sort descending Modifiers Object type Summary Overriden Title
SearchIndex::$cacheTagsInvalidator protected property The cache tags invalidator.
SearchIndex::$configFactory protected property The config factory.
SearchIndex::$connection protected property The database connection.
SearchIndex::$replica protected property The database replica connection.
SearchIndex::$textProcessor protected property The text processor.
SearchIndex::clear public function Clears either a part of, or the entire search index. Overrides SearchIndexInterface::clear
SearchIndex::index public function Updates the full-text search index for a particular item. Overrides SearchIndexInterface::index
SearchIndex::markForReindex public function Changes the timestamp on indexed items to &#039;now&#039; to force reindexing. Overrides SearchIndexInterface::markForReindex
SearchIndex::updateWordWeights public function Updates the {search_total} database table. Overrides SearchIndexInterface::updateWordWeights
SearchIndex::__construct public function SearchIndex constructor.

Buggy or inaccurate documentation? Please file an issue. Need support? Need help programming? Connect with the Drupal community.