Change how hashtags are normalized (#18795)

* Change how hashtags are normalized

* Fix tests
This commit is contained in:
Eugen Rochko 2022-07-13 15:03:28 +02:00 committed by GitHub
parent 89ef936126
commit 38d04135bf
29 changed files with 193 additions and 51 deletions

View file

@ -16,6 +16,8 @@ module Admin
if @tag.update(tag_params.merge(reviewed_at: Time.now.utc))
redirect_to admin_tag_path(@tag.id), notice: I18n.t('admin.tags.updated_msg')
else
@time_period = (6.days.ago.to_date...Time.now.utc.to_date)
render :show
end
end
@ -27,7 +29,7 @@ module Admin
end
def tag_params
params.require(:tag).permit(:name, :trendable, :usable, :listable)
params.require(:tag).permit(:name, :display_name, :trendable, :usable, :listable)
end
end
end

View file

@ -13,9 +13,7 @@ class Api::V1::FeaturedTagsController < Api::BaseController
end
def create
@featured_tag = current_account.featured_tags.new(featured_tag_params)
@featured_tag.reset_data
@featured_tag.save!
@featured_tag = current_account.featured_tags.create!(featured_tag_params)
render json: @featured_tag, serializer: REST::FeaturedTagSerializer
end

View file

@ -11,7 +11,6 @@ class Settings::FeaturedTagsController < Settings::BaseController
def create
@featured_tag = current_account.featured_tags.new(featured_tag_params)
@featured_tag.reset_data
if @featured_tag.save
redirect_to settings_featured_tags_path

View file

@ -606,7 +606,20 @@ function insertIntoTagHistory(recognizedTags, text) {
const state = getState();
const oldHistory = state.getIn(['compose', 'tagHistory']);
const me = state.getIn(['meta', 'me']);
const names = recognizedTags.map(tag => text.match(new RegExp(`#${tag.name}`, 'i'))[0].slice(1));
// FIXME: Matching input hashtags with recognized hashtags has become more
// complicated because of new normalization rules, it's no longer just
// a case sensitivity issue
const names = recognizedTags.map(tag => {
const matches = text.match(new RegExp(`#${tag.name}`, 'i'));
if (matches && matches.length > 0) {
return matches[0].slice(1);
} else {
return tag.name;
}
});
const intersectedOldHistory = oldHistory.filter(name => names.findIndex(newName => newName.toLowerCase() === name.toLowerCase()) === -1);
names.push(...intersectedOldHistory.toJS());

10
app/lib/ascii_folding.rb Normal file
View file

@ -0,0 +1,10 @@
# frozen_string_literal: true
class ASCIIFolding
NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
def fold(str)
str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
end
end

View file

@ -0,0 +1,25 @@
# frozen_string_literal: true
class HashtagNormalizer
def normalize(str)
remove_invalid_characters(ascii_folding(lowercase(cjk_width(str))))
end
private
def remove_invalid_characters(str)
str.gsub(/[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/, '')
end
def ascii_folding(str)
ASCIIFolding.new.fold(str)
end
def lowercase(str)
str.mb_chars.downcase.to_s
end
def cjk_width(str)
str.unicode_normalize(:nfkc)
end
end

View file

@ -62,7 +62,7 @@ class Account < ApplicationRecord
)
USERNAME_RE = /[a-z0-9_]+([a-z0-9_\.-]+[a-z0-9_]+)?/i
MENTION_RE = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:word:]\.\-]+[[:word:]]+)?)/i
MENTION_RE = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:alnum:]\.\-]+[[:alnum:]]+)?)/i
URL_PREFIX_RE = /\Ahttp(s?):\/\/[^\/]+/
include Attachmentable

View file

@ -3,14 +3,14 @@
#
# Table name: custom_filters
#
# id :bigint not null, primary key
# account_id :bigint
# id :bigint(8) not null, primary key
# account_id :bigint(8)
# expires_at :datetime
# phrase :text default(""), not null
# context :string default([]), not null, is an Array
# created_at :datetime not null
# updated_at :datetime not null
# action :integer default(0), not null
# action :integer default("warn"), not null
#
class CustomFilter < ApplicationRecord

View file

@ -3,8 +3,8 @@
#
# Table name: custom_filter_keywords
#
# id :bigint not null, primary key
# custom_filter_id :bigint not null
# id :bigint(8) not null, primary key
# custom_filter_id :bigint(8) not null
# keyword :text default(""), not null
# whole_word :boolean default(TRUE), not null
# created_at :datetime not null

View file

@ -13,17 +13,19 @@
#
class FeaturedTag < ApplicationRecord
belongs_to :account, inverse_of: :featured_tags, required: true
belongs_to :tag, inverse_of: :featured_tags, required: true
belongs_to :account, inverse_of: :featured_tags
belongs_to :tag, inverse_of: :featured_tags, optional: true # Set after validation
delegate :name, to: :tag, allow_nil: true
validates_associated :tag, on: :create
validates :name, presence: true, on: :create
validate :validate_tag_name, on: :create
validate :validate_featured_tags_limit, on: :create
def name=(str)
self.tag = Tag.find_or_create_by_names(str.strip)&.first
before_create :set_tag
before_create :reset_data
attr_writer :name
def name
tag_id.present? ? tag.name : @name
end
def increment(timestamp)
@ -34,14 +36,23 @@ class FeaturedTag < ApplicationRecord
update(statuses_count: [0, statuses_count - 1].max, last_status_at: account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).where.not(id: deleted_status_id).select(:created_at).first&.created_at)
end
private
def set_tag
self.tag = Tag.find_or_create_by_names(@name)&.first
end
def reset_data
self.statuses_count = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).count
self.last_status_at = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).select(:created_at).first&.created_at
end
private
def validate_featured_tags_limit
errors.add(:base, I18n.t('featured_tags.errors.limit')) if account.featured_tags.count >= 10
end
def validate_tag_name
errors.add(:name, :blank) if @name.blank?
errors.add(:name, :invalid) unless @name.match?(/\A(#{Tag::HASHTAG_NAME_RE})\z/i)
end
end

View file

@ -15,6 +15,7 @@
# last_status_at :datetime
# max_score :float
# max_score_at :datetime
# display_name :string
#
class Tag < ApplicationRecord
@ -24,11 +25,12 @@ class Tag < ApplicationRecord
has_many :featured_tags, dependent: :destroy, inverse_of: :tag
HASHTAG_SEPARATORS = "_\u00B7\u200c"
HASHTAG_NAME_RE = "([[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_])|([[:word:]_]*[[:alpha:]][[:word:]_]*)"
HASHTAG_NAME_RE = "([[:alnum:]_][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alnum:]_])|([[:alnum:]_]*[[:alpha:]][[:alnum:]_]*)"
HASHTAG_RE = /(?:^|[^\/\)\w])#(#{HASHTAG_NAME_RE})/i
validates :name, presence: true, format: { with: /\A(#{HASHTAG_NAME_RE})\z/i }
validate :validate_name_change, if: -> { !new_record? && name_changed? }
validate :validate_display_name_change, if: -> { !new_record? && display_name_changed? }
scope :reviewed, -> { where.not(reviewed_at: nil) }
scope :unreviewed, -> { where(reviewed_at: nil) }
@ -46,6 +48,10 @@ class Tag < ApplicationRecord
name
end
def display_name
attributes['display_name'] || name
end
def usable
boolean_with_default('usable', true)
end
@ -90,8 +96,10 @@ class Tag < ApplicationRecord
class << self
def find_or_create_by_names(name_or_names)
Array(name_or_names).map(&method(:normalize)).uniq { |str| str.mb_chars.downcase.to_s }.map do |normalized_name|
tag = matching_name(normalized_name).first || create(name: normalized_name)
names = Array(name_or_names).map { |str| [normalize(str), str] }.uniq(&:first)
names.map do |(normalized_name, display_name)|
tag = matching_name(normalized_name).first || create(name: normalized_name, display_name: display_name)
yield tag if block_given?
@ -129,7 +137,7 @@ class Tag < ApplicationRecord
end
def normalize(str)
str.gsub(/\A#/, '')
HashtagNormalizer.new.normalize(str)
end
end
@ -138,4 +146,8 @@ class Tag < ApplicationRecord
def validate_name_change
errors.add(:name, I18n.t('tags.does_not_match_previous_name')) unless name_was.mb_chars.casecmp(name.mb_chars).zero?
end
def validate_display_name_change
errors.add(:display_name, I18n.t('tags.does_not_match_previous_name')) unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero?
end
end

View file

@ -10,11 +10,11 @@ class ActivityPub::HashtagSerializer < ActivityPub::Serializer
end
def name
"##{object.name}"
"##{object.display_name}"
end
def href
if object.class.name == 'FeaturedTag'
if object.instance_of?(FeaturedTag)
short_account_tag_url(object.account, object.tag)
else
tag_url(object)

View file

@ -12,4 +12,8 @@ class REST::FeaturedTagSerializer < ActiveModel::Serializer
def url
short_account_tag_url(object.account, object.tag)
end
def name
object.display_name
end
end

View file

@ -8,4 +8,8 @@ class REST::TagSerializer < ActiveModel::Serializer
def url
tag_url(object)
end
def name
object.display_name
end
end

View file

@ -75,7 +75,7 @@
= link_to short_account_tag_path(@account, featured_tag.tag) do
%h4
= fa_icon 'hashtag'
= featured_tag.name
= featured_tag.display_name
%small
- if featured_tag.last_status_at.nil?
= t('accounts.nothing_here')

View file

@ -28,7 +28,7 @@ RSS::Builder.build do |doc|
end
status.tags.each do |tag|
item.category(tag.name)
item.category(tag.display_name)
end
end
end

View file

@ -2,7 +2,7 @@
= javascript_pack_tag 'admin', async: true, crossorigin: 'anonymous'
- content_for :page_title do
= "##{@tag.name}"
= "##{@tag.display_name}"
- if current_user.can?(:view_dashboard)
- content_for :heading_actions do
@ -53,7 +53,7 @@
= render 'shared/error_messages', object: @tag
.fields-group
= f.input :name, wrapper: :with_block_label
= f.input :display_name, wrapper: :with_block_label
.fields-group
= f.input :usable, as: :boolean, wrapper: :with_label

View file

@ -6,7 +6,7 @@
.pending-account__header
= link_to admin_tag_path(tag.id) do
= fa_icon 'hashtag'
= tag.name
= tag.display_name
%br/

View file

@ -1,12 +1,12 @@
<%= raw t('admin_mailer.new_trends.new_trending_tags.title') %>
<% @tags.each do |tag| %>
- #<%= tag.name %>
- #<%= tag.display_name %>
<%= raw t('admin.trends.tags.usage_comparison', today: tag.history.get(Time.now.utc).accounts, yesterday: tag.history.get(Time.now.utc - 1.day).accounts) %> • <%= t('admin.trends.tags.current_score', score: Trends.tags.score(tag.id).round(2)) %>
<% end %>
<% if @lowest_trending_tag %>
<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %>
<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.display_name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %>
<% else %>
<%= raw t('admin_mailer.new_trends.new_trending_tags.no_approved_tags') %>
<% end %>

View file

@ -9,7 +9,7 @@
= render 'shared/error_messages', object: @featured_tag
.fields-group
= f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ')
= f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.display_name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ')
.actions
= f.button :button, t('featured_tags.add_new'), type: :submit

View file

@ -1,6 +1,6 @@
= opengraph 'og:site_name', t('about.hosted_on', domain: site_hostname)
= opengraph 'og:url', tag_url(@tag)
= opengraph 'og:type', 'website'
= opengraph 'og:title', "##{@tag.name}"
= opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.name))
= opengraph 'og:title', "##{@tag.display_name}"
= opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.display_name))
= opengraph 'twitter:card', 'summary'

View file

@ -1,5 +1,5 @@
- content_for :page_title do
= "##{@tag.name}"
= "##{@tag.display_name}"
- content_for :header_tags do
%meta{ name: 'robots', content: 'noindex' }/
@ -9,8 +9,8 @@
= render 'og'
.page-header
%h1= "##{@tag.name}"
%p= t('about.about_hashtag_html', hashtag: @tag.name)
%h1= "##{@tag.display_name}"
%p= t('about.about_hashtag_html', hashtag: @tag.display_name)
#mastodon-timeline{ data: { props: Oj.dump(default_props.merge(hashtag: @tag.name, local: @local)) }}
.notranslate#modal-container

View file

@ -1,6 +1,6 @@
RSS::Builder.build do |doc|
doc.title("##{@tag.name}")
doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.name))
doc.title("##{@tag.display_name}")
doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.display_name))
doc.link(tag_url(@tag))
doc.last_build_date(@statuses.first.created_at) if @statuses.any?
doc.generator("Mastodon v#{Mastodon::Version.to_s}")
@ -26,7 +26,7 @@ RSS::Builder.build do |doc|
end
status.tags.each do |tag|
item.category(tag.name)
item.category(tag.display_name)
end
end
end

View file

@ -24,6 +24,7 @@ ActiveSupport::Inflector.inflections(:en) do |inflect|
inflect.acronym 'RSS'
inflect.acronym 'REST'
inflect.acronym 'URL'
inflect.acronym 'ASCII'
inflect.singular 'data', 'data'
end

View file

@ -0,0 +1,5 @@
class AddDisplayNameToTags < ActiveRecord::Migration[6.1]
def change
add_column :tags, :display_name, :string
end
end

View file

@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2022_07_04_024901) do
ActiveRecord::Schema.define(version: 2022_07_10_102457) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
@ -940,6 +940,7 @@ ActiveRecord::Schema.define(version: 2022_07_04_024901) do
t.datetime "last_status_at"
t.float "max_score"
t.datetime "max_score_at"
t.string "display_name"
t.index "lower((name)::text) text_pattern_ops", name: "index_tags_on_name_lower_btree", unique: true
end

View file

@ -0,0 +1,29 @@
# frozen_string_literal: true
require 'rails_helper'
describe HashtagNormalizer do
subject { described_class.new }
describe '#normalize' do
it 'converts full-width Latin characters into basic Latin characters' do
expect(subject.normalize('')).to eq 'synthwave'
end
it 'converts half-width Katakana into Kana characters' do
expect(subject.normalize('シーサイドライナー')).to eq 'シーサイドライナー'
end
it 'converts modified Latin characters into basic Latin characters' do
expect(subject.normalize('BLÅHAJ')).to eq 'blahaj'
end
it 'strips out invalid characters' do
expect(subject.normalize('#foo')).to eq 'foo'
end
it 'keeps valid characters' do
expect(subject.normalize('a·b')).to eq 'a·b'
end
end
end

View file

@ -91,7 +91,7 @@ RSpec.describe Tag, type: :model do
upcase_string = 'abcABCやゆよ'
downcase_string = 'abcabcやゆよ';
tag = Fabricate(:tag, name: downcase_string)
tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.find_normalized(upcase_string)).to eq tag
end
end
@ -101,12 +101,12 @@ RSpec.describe Tag, type: :model do
upcase_string = 'abcABCやゆよ'
downcase_string = 'abcabcやゆよ';
tag = Fabricate(:tag, name: downcase_string)
tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.matches_name(upcase_string)).to eq [tag]
end
it 'uses the LIKE operator' do
expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100\\%abc%')]
expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')]
end
end
@ -115,7 +115,7 @@ RSpec.describe Tag, type: :model do
upcase_string = 'abcABCやゆよ'
downcase_string = 'abcabcやゆよ';
tag = Fabricate(:tag, name: downcase_string)
tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
expect(Tag.matching_name(upcase_string)).to eq [tag]
end
end

View file

@ -892,6 +892,34 @@ const startWorker = async (workerId) => {
return arr;
};
/**
* See app/lib/ascii_folder.rb for the canon definitions
* of these constants
*/
const NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž';
const EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz';
/**
* @param {string} str
* @return {string}
*/
const foldToASCII = str => {
const regex = new RegExp(NON_ASCII_CHARS.split('').join('|'), 'g');
return str.replace(regex, match => {
const index = NON_ASCII_CHARS.indexOf(match);
return EQUIVALENT_ASCII_CHARS[index];
});
};
/**
* @param {string} str
* @return {string}
*/
const normalizeHashtag = str => {
return foldToASCII(str.normalize('NFKC').toLowerCase()).replace(/[^\p{L}\p{N}_\u00b7\u200c]/gu, '');
};
/**
* @param {any} req
* @param {string} name
@ -968,7 +996,7 @@ const startWorker = async (workerId) => {
reject('No tag for stream provided');
} else {
resolve({
channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}`],
channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}`],
options: { needsFiltering: true },
});
}
@ -979,7 +1007,7 @@ const startWorker = async (workerId) => {
reject('No tag for stream provided');
} else {
resolve({
channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}:local`],
channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}:local`],
options: { needsFiltering: true },
});
}