Skip to content

Commit 8ef7502

Browse files
naitoh and kou authored
Add IOSource#match? method (#216)
## Why?

`StringScanner#match?` is faster than `StringScanner#check`.

See: ruby/strscan#111

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22]
Calculating -------------------------------------
                         before       after before(YJIT)  after(YJIT)
                 dom     18.819      19.362       32.846       34.708 i/s -     100.000 times in 5.313905s 5.164791s 3.044500s 2.881200s
                 sax     28.188      29.982       48.386       52.554 i/s -     100.000 times in 3.547597s 3.335304s 2.066732s 1.902809s
                pull     31.962      33.902       57.868       60.662 i/s -     100.000 times in 3.128689s 2.949690s 1.728071s 1.648467s
              stream     31.436      33.030       52.808       56.647 i/s -     100.000 times in 3.181095s 3.027574s 1.893635s 1.765304s

Comparison:
                              dom
         after(YJIT):        34.7 i/s
        before(YJIT):        32.8 i/s - 1.06x  slower
               after:        19.4 i/s - 1.79x  slower
              before:        18.8 i/s - 1.84x  slower

                              sax
         after(YJIT):        52.6 i/s
        before(YJIT):        48.4 i/s - 1.09x  slower
               after:        30.0 i/s - 1.75x  slower
              before:        28.2 i/s - 1.86x  slower

                             pull
         after(YJIT):        60.7 i/s
        before(YJIT):        57.9 i/s - 1.05x  slower
               after:        33.9 i/s - 1.79x  slower
              before:        32.0 i/s - 1.90x  slower

                           stream
         after(YJIT):        56.6 i/s
        before(YJIT):        52.8 i/s - 1.07x  slower
               after:        33.0 i/s - 1.72x  slower
              before:        31.4 i/s - 1.80x  slower
```

- YJIT=ON : 1.05x - 1.09x faster
- YJIT=OFF : 1.02x - 1.06x faster

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
1 parent 6a8c041 commit 8ef7502

File tree

2 files changed

+77
-42
lines changed

2 files changed

+77
-42
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -269,10 +269,10 @@ def pull_event
269269
@source.ensure_buffer
270270
if @document_status == nil
271271
start_position = @source.position
272-
if @source.match("<?", true)
272+
if @source.match?("<?", true)
273273
return process_instruction
274-
elsif @source.match("<!", true)
275-
if @source.match("--", true)
274+
elsif @source.match?("<!", true)
275+
if @source.match?("--", true)
276276
md = @source.match(/(.*?)-->/um, true)
277277
if md.nil?
278278
raise REXML::ParseException.new("Unclosed comment", @source)
@@ -281,10 +281,10 @@ def pull_event
281281
raise REXML::ParseException.new("Malformed comment", @source)
282282
end
283283
return [ :comment, md[1] ]
284-
elsif @source.match("DOCTYPE", true)
284+
elsif @source.match?("DOCTYPE", true)
285285
base_error_message = "Malformed DOCTYPE"
286-
unless @source.match(/\s+/um, true)
287-
if @source.match(">")
286+
unless @source.match?(/\s+/um, true)
287+
if @source.match?(">")
288288
message = "#{base_error_message}: name is missing"
289289
else
290290
message = "#{base_error_message}: invalid name"
@@ -293,10 +293,10 @@ def pull_event
293293
raise REXML::ParseException.new(message, @source)
294294
end
295295
name = parse_name(base_error_message)
296-
if @source.match(/\s*\[/um, true)
296+
if @source.match?(/\s*\[/um, true)
297297
id = [nil, nil, nil]
298298
@document_status = :in_doctype
299-
elsif @source.match(/\s*>/um, true)
299+
elsif @source.match?(/\s*>/um, true)
300300
id = [nil, nil, nil]
301301
@document_status = :after_doctype
302302
@source.ensure_buffer
@@ -308,9 +308,9 @@ def pull_event
308308
# For backward compatibility
309309
id[1], id[2] = id[2], nil
310310
end
311-
if @source.match(/\s*\[/um, true)
311+
if @source.match?(/\s*\[/um, true)
312312
@document_status = :in_doctype
313-
elsif @source.match(/\s*>/um, true)
313+
elsif @source.match?(/\s*>/um, true)
314314
@document_status = :after_doctype
315315
@source.ensure_buffer
316316
else
@@ -320,7 +320,7 @@ def pull_event
320320
end
321321
args = [:start_doctype, name, *id]
322322
if @document_status == :after_doctype
323-
@source.match(/\s*/um, true)
323+
@source.match?(/\s*/um, true)
324324
@stack << [ :end_doctype ]
325325
end
326326
return args
@@ -331,14 +331,14 @@ def pull_event
331331
end
332332
end
333333
if @document_status == :in_doctype
334-
@source.match(/\s*/um, true) # skip spaces
334+
@source.match?(/\s*/um, true) # skip spaces
335335
start_position = @source.position
336-
if @source.match("<!", true)
337-
if @source.match("ELEMENT", true)
336+
if @source.match?("<!", true)
337+
if @source.match?("ELEMENT", true)
338338
md = @source.match(/(.*?)>/um, true)
339339
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
340340
return [ :elementdecl, "<!ELEMENT" + md[1] ]
341-
elsif @source.match("ENTITY", true)
341+
elsif @source.match?("ENTITY", true)
342342
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
343343
unless match_data
344344
raise REXML::ParseException.new("Malformed entity declaration", @source)
@@ -370,7 +370,7 @@ def pull_event
370370
end
371371
match << '%' if ref
372372
return match
373-
elsif @source.match("ATTLIST", true)
373+
elsif @source.match?("ATTLIST", true)
374374
md = @source.match(Private::ATTLISTDECL_END, true)
375375
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
376376
element = md[1]
@@ -390,10 +390,10 @@ def pull_event
390390
end
391391
end
392392
return [ :attlistdecl, element, pairs, contents ]
393-
elsif @source.match("NOTATION", true)
393+
elsif @source.match?("NOTATION", true)
394394
base_error_message = "Malformed notation declaration"
395-
unless @source.match(/\s+/um, true)
396-
if @source.match(">")
395+
unless @source.match?(/\s+/um, true)
396+
if @source.match?(">")
397397
message = "#{base_error_message}: name is missing"
398398
else
399399
message = "#{base_error_message}: invalid name"
@@ -405,7 +405,7 @@ def pull_event
405405
id = parse_id(base_error_message,
406406
accept_external_id: true,
407407
accept_public_id: true)
408-
unless @source.match(/\s*>/um, true)
408+
unless @source.match?(/\s*>/um, true)
409409
message = "#{base_error_message}: garbage before end >"
410410
raise REXML::ParseException.new(message, @source)
411411
end
@@ -419,7 +419,7 @@ def pull_event
419419
end
420420
elsif match = @source.match(/(%.*?;)\s*/um, true)
421421
return [ :externalentity, match[1] ]
422-
elsif @source.match(/\]\s*>/um, true)
422+
elsif @source.match?(/\]\s*>/um, true)
423423
@document_status = :after_doctype
424424
return [ :end_doctype ]
425425
end
@@ -428,16 +428,16 @@ def pull_event
428428
end
429429
end
430430
if @document_status == :after_doctype
431-
@source.match(/\s*/um, true)
431+
@source.match?(/\s*/um, true)
432432
end
433433
begin
434434
start_position = @source.position
435-
if @source.match("<", true)
435+
if @source.match?("<", true)
436436
# :text's read_until may remain only "<" in buffer. In the
437437
# case, buffer is empty here. So we need to fill buffer
438438
# here explicitly.
439439
@source.ensure_buffer
440-
if @source.match("/", true)
440+
if @source.match?("/", true)
441441
@namespaces_restore_stack.pop
442442
last_tag = @tags.pop
443443
md = @source.match(Private::CLOSE_PATTERN, true)
@@ -452,7 +452,7 @@ def pull_event
452452
raise REXML::ParseException.new(message, @source)
453453
end
454454
return [ :end_element, last_tag ]
455-
elsif @source.match("!", true)
455+
elsif @source.match?("!", true)
456456
md = @source.match(/([^>]*>)/um)
457457
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
458458
raise REXML::ParseException.new("Malformed node", @source) unless md
@@ -470,7 +470,7 @@ def pull_event
470470
end
471471
raise REXML::ParseException.new( "Declarations can only occur "+
472472
"in the doctype declaration.", @source)
473-
elsif @source.match("?", true)
473+
elsif @source.match?("?", true)
474474
return process_instruction
475475
else
476476
# Get the next tag
@@ -651,7 +651,7 @@ def need_source_encoding_update?(xml_declaration_encoding)
651651
def parse_name(base_error_message)
652652
md = @source.match(Private::NAME_PATTERN, true)
653653
unless md
654-
if @source.match(/\S/um)
654+
if @source.match?(/\S/um)
655655
message = "#{base_error_message}: invalid name"
656656
else
657657
message = "#{base_error_message}: name is missing"
@@ -693,34 +693,34 @@ def parse_id_invalid_details(accept_external_id:,
693693
accept_public_id:)
694694
public = /\A\s*PUBLIC/um
695695
system = /\A\s*SYSTEM/um
696-
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
697-
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
696+
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
697+
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
698698
return "public ID literal is missing"
699699
end
700-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
700+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
701701
return "invalid public ID literal"
702702
end
703703
if accept_public_id
704-
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
704+
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
705705
return "system ID literal is missing"
706706
end
707-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
707+
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
708708
return "invalid system literal"
709709
end
710710
"garbage after system literal"
711711
else
712712
"garbage after public ID literal"
713713
end
714-
elsif accept_external_id and @source.match(/#{system}/um)
715-
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
714+
elsif accept_external_id and @source.match?(/#{system}/um)
715+
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
716716
return "system literal is missing"
717717
end
718-
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
718+
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
719719
return "invalid system literal"
720720
end
721721
"garbage after system literal"
722722
else
723-
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
723+
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
724724
return "invalid ID type"
725725
end
726726
"ID type is missing"
@@ -729,15 +729,15 @@ def parse_id_invalid_details(accept_external_id:,
729729

730730
def process_instruction
731731
name = parse_name("Malformed XML: Invalid processing instruction node")
732-
if @source.match(/\s+/um, true)
732+
if @source.match?(/\s+/um, true)
733733
match_data = @source.match(/(.*?)\?>/um, true)
734734
unless match_data
735735
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
736736
end
737737
content = match_data[1]
738738
else
739739
content = nil
740-
unless @source.match("?>", true)
740+
unless @source.match?("?>", true)
741741
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
742742
end
743743
end
@@ -767,17 +767,17 @@ def parse_attributes(prefixes)
767767
expanded_names = {}
768768
closed = false
769769
while true
770-
if @source.match(">", true)
770+
if @source.match?(">", true)
771771
return attributes, closed
772-
elsif @source.match("/>", true)
772+
elsif @source.match?("/>", true)
773773
closed = true
774774
return attributes, closed
775775
elsif match = @source.match(QNAME, true)
776776
name = match[1]
777777
prefix = match[2]
778778
local_part = match[3]
779779

780-
unless @source.match(/\s*=\s*/um, true)
780+
unless @source.match?(/\s*=\s*/um, true)
781781
message = "Missing attribute equal: <#{name}>"
782782
raise REXML::ParseException.new(message, @source)
783783
end
@@ -793,7 +793,7 @@ def parse_attributes(prefixes)
793793
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
794794
raise REXML::ParseException.new(message, @source)
795795
end
796-
@source.match(/\s*/um, true)
796+
@source.match?(/\s*/um, true)
797797
if prefix == "xmlns"
798798
if local_part == "xml"
799799
if value != Private::XML_PREFIXED_NAMESPACE

lib/rexml/source.rb

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,16 @@ def scan(pattern)
1818
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
1919
super(pattern)
2020
end
21+
22+
def match?(pattern)
23+
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
24+
super(pattern)
25+
end
26+
27+
def skip(pattern)
28+
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
29+
super(pattern)
30+
end
2131
end
2232
end
2333
using StringScannerCheckScanString
@@ -126,6 +136,14 @@ def match(pattern, cons=false)
126136
end
127137
end
128138

139+
def match?(pattern, cons=false)
140+
if cons
141+
!@scanner.skip(pattern).nil?
142+
else
143+
!@scanner.match?(pattern).nil?
144+
end
145+
end
146+
129147
def position
130148
@scanner.pos
131149
end
@@ -267,6 +285,23 @@ def match( pattern, cons=false )
267285
md.nil? ? nil : @scanner
268286
end
269287

288+
def match?( pattern, cons=false )
289+
# To avoid performance issue, we need to increase bytes to read per scan
290+
min_bytes = 1
291+
while true
292+
if cons
293+
n_matched_bytes = @scanner.skip(pattern)
294+
else
295+
n_matched_bytes = @scanner.match?(pattern)
296+
end
297+
return true if n_matched_bytes
298+
return false if pattern.is_a?(String)
299+
return false if @source.nil?
300+
return false unless read(nil, min_bytes)
301+
min_bytes *= 2
302+
end
303+
end
304+
270305
def empty?
271306
super and ( @source.nil? || @source.eof? )
272307
end

0 commit comments

Comments
 (0)