Add more accurate account search (#11537)

* Add more accurate account search

When Elasticsearch is available, a more accurate search is implemented:

- Using edge n-gram index for acct and display name
- Using asciifolding and cjk width normalization on display names
- Using Gaussian decay on account activity for additional scoring (recency)
- Using followers/friends ratio for additional scoring (spamminess)
- Using followers number for additional scoring (size)

The exact match precedence only takes effect when the input conforms
to the username format and the username part of it is complete, i.e.
when the user started typing the domain part.

* Support single-letter usernames

* Fix tests

* Fix not picking up account updates

* Add weights and normalization for scores, skip zero terms queries

* Use local counts for accounts index, adjust search parameters

* Fix mistakes

* Using updated_at of accounts is inadequate for remote accounts
main
Eugen Rochko 5 years ago committed by GitHub
parent 2ca6b2bb6c
commit 8fdff2748f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,36 @@
# frozen_string_literal: true
# Chewy index definition used for account search.
#
# `acct` and `display_name` are indexed with the `edge_ngram` analyzer and
# searched with the whitespace-tokenized `content` analyzer. Both analyzers
# apply the same lowercase / asciifolding / cjk_width filters.
class AccountsIndex < Chewy::Index
  settings(
    index: { refresh_interval: '5m' },
    analysis: {
      analyzer: {
        # Applied to the search input (see `search_analyzer` on the fields below).
        content: {
          tokenizer: 'whitespace',
          filter: %w[lowercase asciifolding cjk_width],
        },

        # Applied to the stored values at index time.
        edge_ngram: {
          tokenizer: 'edge_ngram',
          filter: %w[lowercase asciifolding cjk_width],
        },
      },

      tokenizer: {
        # Prefixes from 1 to 15 characters; min_gram of 1 keeps
        # single-letter usernames findable.
        edge_ngram: {
          type: 'edge_ngram',
          min_gram: 1,
          max_gram: 15,
        },
      },
    }
  )

  # Records fed into the index. `searchable` is a scope defined on Account
  # (not visible here); account_stat is eager-loaded alongside each record.
  indexed_accounts = ::Account.searchable.includes(:account_stat)

  # A record is removed from the index once it is destroyed or no longer
  # reports itself as searchable.
  purge_when = ->(record) { record.destroyed? || !record.searchable? }

  define_type indexed_accounts, delete_if: purge_when do
    root date_detection: false do
      field :id, type: 'long'

      field :display_name, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'

      # "username" for accounts without a domain, "username@domain" otherwise.
      field :acct,
            type: 'text',
            analyzer: 'edge_ngram',
            search_analyzer: 'content',
            value: ->(record) { [record.username, record.domain].compact.join('@') }

      # Both counts are computed from the relationship associations of the record.
      field :following_count, type: 'long', value: ->(record) { record.active_relationships.count }
      field :followers_count, type: 'long', value: ->(record) { record.passive_relationships.count }

      # Falls back to the creation time when there is no last_status_at value.
      field :last_status_at, type: 'date', value: ->(record) { record.last_status_at || record.created_at }
    end
  end
end

@ -127,6 +127,8 @@ class Account < ApplicationRecord
delegate :chosen_languages, to: :user, prefix: false, allow_nil: true delegate :chosen_languages, to: :user, prefix: false, allow_nil: true
update_index('accounts#account', :self) if Chewy.enabled?
def local? def local?
domain.nil? domain.nil?
end end
@ -169,6 +171,10 @@ class Account < ApplicationRecord
subscription_expires_at.present? subscription_expires_at.present?
end end
# Whether this account may appear in search results: true only when it is
# neither suspended nor moved.
def searchable?
  !suspended? && !moved?
end
def possibly_stale? def possibly_stale?
last_webfingered_at.nil? || last_webfingered_at <= 1.day.ago last_webfingered_at.nil? || last_webfingered_at <= 1.day.ago
end end

@ -16,6 +16,8 @@
class AccountStat < ApplicationRecord class AccountStat < ApplicationRecord
belongs_to :account, inverse_of: :account_stat belongs_to :account, inverse_of: :account_stat
update_index('accounts#account', :account) if Chewy.enabled?
def increment_count!(key) def increment_count!(key)
update(attributes_for_increment(key)) update(attributes_for_increment(key))
end end

@ -4,105 +4,134 @@ class AccountSearchService < BaseService
attr_reader :query, :limit, :offset, :options, :account attr_reader :query, :limit, :offset, :options, :account
def call(query, account = nil, options = {}) def call(query, account = nil, options = {})
@query = query.strip @acct_hint = query.start_with?('@')
@query = query.strip.gsub(/\A@/, '')
@limit = options[:limit].to_i @limit = options[:limit].to_i
@offset = options[:offset].to_i @offset = options[:offset].to_i
@options = options @options = options
@account = account @account = account
search_service_results search_service_results.compact.uniq
end end
private private
def search_service_results def search_service_results
return [] if query_blank_or_hashtag? || limit < 1 return [] if query.blank? || limit < 1
if resolving_non_matching_remote_account? [exact_match] + search_results
[ResolveAccountService.new.call("#{query_username}@#{query_domain}")].compact
else
search_results_and_exact_match.compact.uniq
end
end
def resolving_non_matching_remote_account?
offset.zero? && options[:resolve] && !exact_match? && !domain_is_local?
end end
def search_results_and_exact_match def exact_match
return search_results.to_a unless offset.zero? return unless offset.zero? && username_complete?
results = [exact_match]
return results if exact_match? && limit == 1 return @exact_match if defined?(@exact_match)
results + search_results.to_a @exact_match = begin
if options[:resolve]
ResolveAccountService.new.call(query)
elsif domain_is_local?
Account.find_local(query_username)
else
Account.find_remote(query_username, query_domain)
end end
def query_blank_or_hashtag?
query.blank? || query.start_with?('#')
end end
def split_query_string
@split_query_string ||= query.gsub(/\A@/, '').split('@')
end end
def query_username def search_results
@query_username ||= split_query_string.first || '' return [] if limit_for_non_exact_results.zero?
end
def query_domain @search_results ||= begin
@query_domain ||= query_without_split? ? nil : split_query_string.last if Chewy.enabled?
from_elasticsearch
else
from_database
end
end end
def query_without_split?
split_query_string.size == 1
end end
def domain_is_local? def from_database
@domain_is_local ||= TagManager.instance.local_domain?(query_domain) if account
advanced_search_results
else
simple_search_results
end
end end
def search_from def advanced_search_results
options[:following] && account ? account.following : Account Account.advanced_search_for(terms_for_query, account, limit_for_non_exact_results, options[:following], offset)
end end
def exact_match? def simple_search_results
exact_match.present? Account.search_for(terms_for_query, limit_for_non_exact_results, offset)
end end
def exact_match def from_elasticsearch
return @exact_match if defined?(@exact_match) must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct) : %w(acct^2 display_name), type: 'best_fields' } }]
should_clauses = []
@exact_match = begin if account
if domain_is_local? return [] if options[:following] && following_ids.empty?
search_from.without_suspended.find_local(query_username)
else if options[:following]
search_from.without_suspended.find_remote(query_username, query_domain) must_clauses << { terms: { id: following_ids } }
end elsif following_ids.any?
should_clauses << { terms: { id: following_ids, boost: 100 } }
end end
end end
def search_results query = { bool: { must: must_clauses, should: should_clauses } }
@search_results ||= begin functions = [reputation_score_function, followers_score_function, time_distance_function]
if account
advanced_search_results records = AccountsIndex.query(function_score: { query: query, functions: functions, boost_mode: 'multiply', score_mode: 'avg' })
else .limit(limit_for_non_exact_results)
simple_search_results .offset(offset)
.objects
.compact
ActiveRecord::Associations::Preloader.new.preload(records, :account_stat)
records
end end
def reputation_score_function
{
script_score: {
script: {
source: "(doc['followers_count'].value + 0.0) / (doc['followers_count'].value + doc['following_count'].value + 1)",
},
},
}
end end
def followers_score_function
{
field_value_factor: {
field: 'followers_count',
modifier: 'log2p',
missing: 1,
},
}
end end
def advanced_search_results def time_distance_function
Account.advanced_search_for(terms_for_query, account, limit_for_non_exact_results, options[:following], offset) {
gauss: {
last_status_at: {
scale: '30d',
offset: '30d',
decay: 0.3,
},
},
}
end end
def simple_search_results def following_ids
Account.search_for(terms_for_query, limit_for_non_exact_results, offset) @following_ids ||= account.active_relationships.pluck(:target_account_id)
end end
def limit_for_non_exact_results def limit_for_non_exact_results
if offset.zero? && exact_match? if exact_match?
limit - 1 limit - 1
else else
limit limit
@ -113,7 +142,39 @@ class AccountSearchService < BaseService
if domain_is_local? if domain_is_local?
query_username query_username
else else
"#{query_username} #{query_domain}" query
end
end end
# The query split on '@', computed once and cached.
def split_query_string
  return @split_query_string if @split_query_string

  @split_query_string = query.split('@')
end
# First '@'-separated part of the query, or an empty string when the
# query has no parts at all.
def query_username
  return @query_username if @query_username

  leading_part = split_query_string.first
  @query_username = leading_part || ''
end
# Last '@'-separated part of the query, or nil when the query consists of
# a single part (i.e. no domain was given).
def query_domain
  return @query_domain if @query_domain

  @query_domain = (split_query_string.last unless query_without_split?)
end
# True when splitting the query on '@' produced exactly one part.
def query_without_split?
  split_query_string.length == 1
end
# Whether the domain part of the query refers to this instance, as decided
# by TagManager#local_domain?.
#
# The answer is cached with a `defined?` guard rather than `||=`: `||=`
# re-evaluates whenever the cached value is false, so a remote domain would
# trigger a fresh lookup on every call.
def domain_is_local?
  return @domain_is_local if defined?(@domain_is_local)

  @domain_is_local = TagManager.instance.local_domain?(query_domain)
end
# Whether an exact match was found for the query.
def exact_match?
  found = exact_match
  found.present?
end
# Whether the username part of the query has been typed in full: the query
# must contain an '@' and, once prefixed with '@', match
# Account::MENTION_RE.
#
# Uses `match?` so the predicate returns a strict true/false (the previous
# `=~` returned a match offset or nil) and does not allocate MatchData or
# touch the regexp globals. Truthiness is unchanged for callers.
def username_complete?
  query.include?('@') && "@#{query}".match?(Account::MENTION_RE)
end
def likely_acct?
@acct_hint || username_complete?
end end
end end

@ -1,110 +1,38 @@
require 'rails_helper' require 'rails_helper'
describe AccountSearchService, type: :service do describe AccountSearchService, type: :service do
describe '.call' do describe '#call' do
describe 'with a query to ignore' do context 'with a query to ignore' do
it 'returns empty array for missing query' do it 'returns empty array for missing query' do
results = subject.call('', nil, limit: 10) results = subject.call('', nil, limit: 10)
expect(results).to eq [] expect(results).to eq []
end end
it 'returns empty array for hashtag query' do
results = subject.call('#tag', nil, limit: 10)
expect(results).to eq []
end
it 'returns empty array for limit zero' do it 'returns empty array for limit zero' do
Fabricate(:account, username: 'match') Fabricate(:account, username: 'match')
results = subject.call('match', nil, limit: 0) results = subject.call('match', nil, limit: 0)
expect(results).to eq [] expect(results).to eq []
end end
end end
describe 'searching for a simple term that is not an exact match' do context 'searching for a simple term that is not an exact match' do
it 'does not return a nil entry in the array for the exact match' do it 'does not return a nil entry in the array for the exact match' do
match = Fabricate(:account, username: 'matchingusername') account = Fabricate(:account, username: 'matchingusername')
results = subject.call('match', nil, limit: 5) results = subject.call('match', nil, limit: 5)
expect(results).to eq [match]
end
end
describe 'searching local and remote users' do
describe "when only '@'" do
before do
allow(Account).to receive(:find_local)
allow(Account).to receive(:search_for)
subject.call('@', nil, limit: 10)
end
it 'uses find_local with empty query to look for local accounts' do
expect(Account).to have_received(:find_local).with('')
end
end
describe 'when no domain' do
before do
allow(Account).to receive(:find_local)
allow(Account).to receive(:search_for)
subject.call('one', nil, limit: 10)
end
it 'uses find_local to look for local accounts' do
expect(Account).to have_received(:find_local).with('one')
end
it 'uses search_for to find matches' do
expect(Account).to have_received(:search_for).with('one', 10, 0)
end
end
describe 'when there is a domain' do
before do
allow(Account).to receive(:find_remote)
end
it 'uses find_remote to look for remote accounts' do
subject.call('two@example.com', nil, limit: 10)
expect(Account).to have_received(:find_remote).with('two', 'example.com')
end
describe 'and there is no account provided' do
it 'uses search_for to find matches' do
allow(Account).to receive(:search_for)
subject.call('two@example.com', nil, limit: 10, resolve: false)
expect(Account).to have_received(:search_for).with('two example.com', 10, 0) expect(results).to eq [account]
end end
end end
describe 'and there is an account provided' do context 'when there is a local domain' do
it 'uses advanced_search_for to find matches' do
account = Fabricate(:account)
allow(Account).to receive(:advanced_search_for)
subject.call('two@example.com', account, limit: 10, resolve: false)
expect(Account).to have_received(:advanced_search_for).with('two example.com', account, 10, nil, 0)
end
end
end
end
describe 'with an exact match' do
it 'returns exact match first, and does not return duplicates' do
partial = Fabricate(:account, username: 'exactness')
exact = Fabricate(:account, username: 'exact')
results = subject.call('exact', nil, limit: 10)
expect(results.size).to eq 2
expect(results).to eq [exact, partial]
end
end
describe 'when there is a local domain' do
around do |example| around do |example|
before = Rails.configuration.x.local_domain before = Rails.configuration.x.local_domain
example.run example.run
Rails.configuration.x.local_domain = before Rails.configuration.x.local_domain = before
end end
@ -112,15 +40,17 @@ describe AccountSearchService, type: :service do
remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e') remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e')
remote_too = Fabricate(:account, username: 'b', domain: 'remote', display_name: 'e') remote_too = Fabricate(:account, username: 'b', domain: 'remote', display_name: 'e')
exact = Fabricate(:account, username: 'e') exact = Fabricate(:account, username: 'e')
Rails.configuration.x.local_domain = 'example.com' Rails.configuration.x.local_domain = 'example.com'
results = subject.call('e@example.com', nil, limit: 2) results = subject.call('e@example.com', nil, limit: 2)
expect(results.size).to eq 2 expect(results.size).to eq 2
expect(results).to eq([exact, remote]).or eq([exact, remote_too]) expect(results).to eq([exact, remote]).or eq([exact, remote_too])
end end
end end
describe 'when there is a domain but no exact match' do context 'when there is a domain but no exact match' do
it 'follows the remote account when resolve is true' do it 'follows the remote account when resolve is true' do
service = double(call: nil) service = double(call: nil)
allow(ResolveAccountService).to receive(:new).and_return(service) allow(ResolveAccountService).to receive(:new).and_return(service)
@ -138,23 +68,21 @@ describe AccountSearchService, type: :service do
end end
end end
describe 'should not include suspended accounts' do
it 'returns the fuzzy match first, and does not return suspended exacts' do it 'returns the fuzzy match first, and does not return suspended exacts' do
partial = Fabricate(:account, username: 'exactness') partial = Fabricate(:account, username: 'exactness')
exact = Fabricate(:account, username: 'exact', suspended: true) exact = Fabricate(:account, username: 'exact', suspended: true)
results = subject.call('exact', nil, limit: 10) results = subject.call('exact', nil, limit: 10)
expect(results.size).to eq 1 expect(results.size).to eq 1
expect(results).to eq [partial] expect(results).to eq [partial]
end end
it "does not return suspended remote accounts" do it "does not return suspended remote accounts" do
remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e', suspended: true) remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e', suspended: true)
results = subject.call('a@example.com', nil, limit: 2) results = subject.call('a@example.com', nil, limit: 2)
expect(results.size).to eq 0 expect(results.size).to eq 0
expect(results).to eq [] expect(results).to eq []
end end
end end
end
end end

Loading…
Cancel
Save