@ -4,9 +4,25 @@ class SpamCheck
include Redisable
include ActionView :: Helpers :: TextHelper
# Threshold over which two Nilsimsa values are considered
# to refer to the same text
NILSIMSA_COMPARE_THRESHOLD = 95

# Nilsimsa doesn't work well on small inputs, so below
# this size, we check only for exact matches with MD5
NILSIMSA_MIN_SIZE = 10

# How long to keep the trail of digests between updates;
# there is no reason to store it forever
EXPIRE_SET_AFTER = 1.week.seconds

# How many digests to keep in an account's trail. If it's
# too small, spam could rotate around different message templates
MAX_TRAIL_SIZE = 10

# How many detected duplicates to allow through before
# considering the message as spam
THRESHOLD = 5
def initialize ( status )
@account = status . account
@ -21,9 +37,9 @@ class SpamCheck
if insufficient_data?
false
elsif nilsimsa?
any_other_digest ?( 'nilsimsa' ) { | _ , other_digest | nilsimsa_compare_value ( digest , other_digest ) > = NILSIMSA_COMPARE_THRESHOLD }
digests_over_threshold ?( 'nilsimsa' ) { | _ , other_digest | nilsimsa_compare_value ( digest , other_digest ) > = NILSIMSA_COMPARE_THRESHOLD }
else
any_other_digest ?( 'md5' ) { | _ , other_digest | other_digest == digest }
digests_over_threshold ?( 'md5' ) { | _ , other_digest | other_digest == digest }
end
end
@ -38,7 +54,7 @@ class SpamCheck
# get the correct status ID back, we have to save it in the string value
redis . zadd ( redis_key , @status . id , digest_with_algorithm )
redis . zremrangebyrank ( redis_key , '0' , '-10' )
redis . zremrangebyrank ( redis_key , 0 , - ( MAX_TRAIL_SIZE + 1 ) )
redis . expire ( redis_key , EXPIRE_SET_AFTER )
end
@ -78,6 +94,20 @@ class SpamCheck
end
end
class << self
  # Runs the spam check for a single status.
  #
  # Builds a checker via `new(status)`. Nothing happens when the checker
  # reports `skip?`. Otherwise the status is flagged when `spam?` is
  # truthy, and remembered when it is not.
  #
  # Returns nil when skipped, otherwise whatever `flag!` or `remember!`
  # returns.
  def perform(status)
    checker = new(status)

    return if checker.skip?
    return checker.flag! if checker.spam?

    checker.remember!
  end
end
private
def disabled?
@ -149,14 +179,14 @@ class SpamCheck
redis . zrange ( redis_key , 0 , - 1 )
end
# Tells whether at least one stored digest record matches.
#
# Each record from `other_digests` is split on ':' into an algorithm,
# a digest and a status id. Records whose algorithm differs from
# +filter_algorithm+ are ignored; the rest are yielded to the block as
# (algorithm, digest, status_id), and a truthy block result is a match.
def any_other_digest?(filter_algorithm)
  other_digests.any? do |record|
    algorithm, other_digest, status_id = record.split(':')
    algorithm == filter_algorithm && yield(algorithm, other_digest, status_id)
  end
end

# Tells whether the number of matching digest records reaches THRESHOLD.
#
# Records are parsed and filtered exactly as in `any_other_digest?`.
# Matches are counted directly rather than collected into an array
# first, since only their number is needed.
def digests_over_threshold?(filter_algorithm)
  matches = other_digests.count do |record|
    algorithm, other_digest, status_id = record.split(':')
    algorithm == filter_algorithm && yield(algorithm, other_digest, status_id)
  end

  matches >= THRESHOLD
end
def matching_status_ids