97 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			97 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
| module Twitter
 | |
|   class Regex
 | |
|     REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
 | |
|     REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
 | |
|     REGEXEN[:valid_url_balanced_parens] = /
 | |
|       \(
 | |
|         (?:
 | |
|           #{REGEXEN[:valid_general_url_path_chars]}+
 | |
|           |
 | |
|           # allow one nested level of balanced parentheses
 | |
|           (?:
 | |
|             #{REGEXEN[:valid_general_url_path_chars]}*
 | |
|             \(
 | |
|               #{REGEXEN[:valid_general_url_path_chars]}+
 | |
|             \)
 | |
|             #{REGEXEN[:valid_general_url_path_chars]}*
 | |
|           )
 | |
|         )
 | |
|       \)
 | |
|     /iox
 | |
|     REGEXEN[:valid_url_path] = /(?:
 | |
|       (?:
 | |
|         #{REGEXEN[:valid_general_url_path_chars]}*
 | |
|         (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
 | |
|         #{REGEXEN[:valid_url_path_ending_chars]}
 | |
|       )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
 | |
|     )/iox
 | |
|     REGEXEN[:valid_url] = %r{
 | |
|       (                                                                                     #   $1 total match
 | |
|         (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
 | |
|         (                                                                                   #   $3 URL
 | |
|           ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher):\/\/)?                                  #   $4 Protocol (optional)
 | |
|           (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
 | |
|           (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
 | |
|           (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
 | |
|           (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
 | |
|         )
 | |
|       )
 | |
|     }iox
 | |
|     REGEXEN[:validate_nodeid] = /(?:
 | |
|       #{REGEXEN[:validate_url_unreserved]}|
 | |
|       #{REGEXEN[:validate_url_pct_encoded]}|
 | |
|       [!$()*+,;=]
 | |
|     )/iox
 | |
|     REGEXEN[:validate_resid] = /(?:
 | |
|       #{REGEXEN[:validate_url_unreserved]}|
 | |
|       #{REGEXEN[:validate_url_pct_encoded]}|
 | |
|       #{REGEXEN[:validate_url_sub_delims]}
 | |
|     )/iox
 | |
|     REGEXEN[:xmpp_uri] = %r{
 | |
|       (xmpp:)                                                                           # Protocol
 | |
|       (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)?                     # Authority (optional)
 | |
|       (#{REGEXEN[:validate_nodeid]}+@)?                                                 # Username in path (optional)
 | |
|       (#{REGEXEN[:valid_domain]})                                                       # Domain in path
 | |
|       (/#{REGEXEN[:validate_resid]}+)?                                                  # Resource in path (optional)
 | |
|       (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
 | |
|     }iox
 | |
|     REGEXEN[:magnet_uri] = %r{
 | |
|       (magnet:)                                                                         # Protocol
 | |
|       (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})  # Query String
 | |
|     }iox
 | |
|     REGEXEN[:valid_extended_uri] = %r{
 | |
|       (                                                                                 #   $1 total match
 | |
|         (#{REGEXEN[:valid_url_preceding_chars]})                                        #   $2 Preceding character
 | |
|         (                                                                               #   $3 URL
 | |
|           (#{REGEXEN[:xmpp_uri]}) | (#{REGEXEN[:magnet_uri]})
 | |
|         )
 | |
|       )
 | |
|     }iox
 | |
|   end
 | |
| 
 | |
|   module Extractor
 | |
|     # Extracts a list of all XMPP and magnet URIs included in the Toot <tt>text</tt> along
 | |
|     # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
 | |
|     # XMPP or magnet URIs an empty array will be returned.
 | |
|     #
 | |
|     # If a block is given then it will be called for each XMPP URI.
 | |
|     def extract_extra_uris_with_indices(text, options = {}) # :yields: uri, start, end
 | |
|       return [] unless text && text.index(":")
 | |
|       urls = []
 | |
| 
 | |
|       text.to_s.scan(Twitter::Regex[:valid_extended_uri]) do
 | |
|         valid_uri_match_data = $~
 | |
| 
 | |
|         start_position = valid_uri_match_data.char_begin(3)
 | |
|         end_position = valid_uri_match_data.char_end(3)
 | |
| 
 | |
|         urls << {
 | |
|           :url => valid_uri_match_data[3],
 | |
|           :indices => [start_position, end_position]
 | |
|         }
 | |
|       end
 | |
|       urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
 | |
|       urls
 | |
|     end
 | |
|   end
 | |
| end
 |