Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions data/custom.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
<mime-info>
<!-- Override audio/mpeg pattern to anchor it at file start -->
<mime-type type="audio/mpeg">
<magic priority="30">
<!-- Anchored version: must match at file start, not mid-buffer -->
<match value="\\A(?:\\x0D\\x0A|\\x00{1,1024})(?:\\xff[\\xe3\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff]|ID3)" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="image/svg+xml">
<sub-class-of type="application/xml" />

Expand Down
1 change: 1 addition & 0 deletions lib/marcel.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

module Marcel
require "marcel/version"
require "marcel/tika_regex"
require "marcel/magic"
require "marcel/mime_type"
end
36 changes: 34 additions & 2 deletions lib/marcel/magic.rb
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,14 @@ def self.magic_match(io, method)

def self.magic_match_io(io, matches, buffer)
matches.any? do |offset, value, children|
# Skip if value is nil (e.g., invalid regex pattern - it was meant for Java after all)
next false if value.nil?

match =
if value
if Range === offset
if value.is_a?(Regexp)
match_regex(io, offset, value, buffer)
elsif Range === offset
io.read(offset.begin, buffer)
x = io.read(offset.end - offset.begin + value.bytesize, buffer)
x && x.include?(value)
Expand All @@ -143,6 +148,33 @@ def self.magic_match_io(io, matches, buffer)
end
end

private_class_method :magic_match, :magic_match_io
# Tests whether +regexp+ matches the data found at +offset+ in +io+.
#
# The stream is consumed from its current position: +offset+ bytes (or
# +offset.begin+ bytes when +offset+ is a Range) are skipped, then a fixed
# window of up to 256 bytes is read and matched. Only the start of a Range
# offset is honoured; the window size does not depend on the Range's end.
# The stream is not rewound here.
#
# @param io [#read] source stream; must support +read(length, outbuf)+
# @param offset [Integer, Range] where the match window starts
# @param regexp [Regexp] pattern to test against the window
# @param buffer [String] scratch buffer handed to +io.read+
# @return [Boolean] true when the window matches; false when nothing could be
#   read or when matching raised an error (a warning is emitted in that case)
# @raise [NoMemoryError, SignalException, SystemExit] these are re-raised and
#   never reported as a failed match
def self.match_regex(io, offset, regexp, buffer)
  start = offset.is_a?(Range) ? offset.begin : offset
  io.read(start, buffer) if start > 0
  data = io.read(256, buffer)
  return false unless data

  # I know, I know... this is awful, but the patterns come from Apache Tika
  # and we are getting warnings about character class overlaps, so we'll
  # suppress warnings for this match call.
  # NOTE: $VERBOSE is process-global, so this briefly affects other threads too.
  begin
    old_verbose = $VERBOSE
    $VERBOSE = nil

    # For regex patterns, simply match within the data buffer
    # The patterns themselves should be designed to match appropriately
    data.match?(regexp)
  ensure
    $VERBOSE = old_verbose
  end
rescue NoMemoryError, SignalException, SystemExit
  # Process-control conditions (Ctrl-C, exit, out of memory) must never be
  # swallowed and turned into "no match".
  raise
# we need to catch all (other) exceptions here because TruffleRuby raises Polyglot::ForeignException
rescue Exception => e
  warn "Marcel::Magic.match_regex: error matching #{regexp.inspect}: #{e.message}"
  false
end

private_class_method :magic_match, :magic_match_io, :match_regex
end
end
1 change: 0 additions & 1 deletion lib/marcel/mime_type/definitions.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# frozen_string_literal: true

Marcel::MimeType.extend "text/plain", extensions: %w( txt asc )
Marcel::MimeType.extend "text/html", magic: [[0..64, "<!DOCTYPE HTML"], [0..64, "<!DOCTYPE html"], [0..64, "<!doctype HTML"], [0..64, "<!doctype html"]]

Marcel::MimeType.extend "application/illustrator", parents: "application/pdf"
Marcel::MimeType.extend "image/vnd.adobe.photoshop", magic: [[0, "8BPS"]], extensions: %w( psd psb )
Expand Down
50 changes: 26 additions & 24 deletions lib/marcel/tables.rb

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions lib/marcel/tika_regex.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# frozen_string_literal: true

module Marcel
  module TikaRegex
    # Apache Tika uses Java regex syntax, which has some differences from Ruby:
    # - (?s) flag in Java is a mode which makes . match newlines
    #   In Ruby, this is equivalent to the multiline flag
    # - Java uses double-escaped sequences like \\d, \\x00, \\u0041 in XML
    #   These need to be converted to Ruby's single-escaped format: \d, \x00, \u0041
    # - Naturally, some Java regex features are not supported in Ruby (e.g., variable-length lookbehinds)
    #
    # This method handles the conversion and gracefully returns nil for incompatible patterns.
    # It leaves the global $VERBOSE setting exactly as it found it on every exit path.
    #
    # @param pattern [String] The Tika regex pattern string
    # @return [Regexp, nil] The compiled (frozen) Ruby Regexp, or nil if the pattern
    #   is nil, empty, or cannot be compiled by Ruby
    def self.to_ruby_regexp(pattern)
      return nil if pattern.nil? || pattern.empty?

      processed = pattern.dup
      flags = 0

      # Converting Java's (?s) dotall flag to Ruby's multiline
      if processed.include?('(?s)')
        processed = processed.gsub('(?s)', '')
        flags |= Regexp::MULTILINE
      end

      # Convert Java-style double-escaped sequences to Ruby single-escaped format
      # This is more complex than a simple gsub because we need to handle:
      # - \\xHH -> \xHH (hex byte)
      # - \\uHHHH -> \uHHHH (unicode)
      # - \\OOO -> \xHH (convert octal to hex to avoid backreference ambiguity in TruffleRuby)
      # - \\d, \\w, \\s, etc. -> \d, \w, \s (character classes)
      # - \\A, \\z, \\Z, \\G, \\B -> \A, \z, \Z, \G, \B (anchors)
      # - \\[, \\], \\{, \\}, etc. -> \[, \], \{, \} (literal characters)
      #
      # We process these specifically to avoid breaking the regex structure
      processed = processed.gsub(/\\\\(x[0-9a-fA-F]{2})/, '\\\\\1')     # \\xHH -> \xHH
                           .gsub(/\\\\(u[0-9a-fA-F]{4})/, '\\\\\1')     # \\uHHHH -> \uHHHH
                           .gsub(/\\\\([0-7]{1,3})/) { "\\x#{$1.to_i(8).to_s(16).rjust(2, '0')}" } # \\OOO -> \xHH (octal to hex so that TruffleRuby doesn't think it's a backreference)
                           .gsub(/\\\\([WDS])/i, '\\\\\1')              # \\d etc. -> \d
                           .gsub(/\\\\([farbentv])/, '\\\\\1')          # \\n etc. -> \n
                           .gsub(/\\\\([AZzGB])/, '\\\\\1')             # \\A etc. -> \A (same meaning in Java and Ruby)
                           .gsub(/\\\\([()\[\]{}|*+?.^$\\])/, '\\\\\1') # \\[ etc. -> \[

      # Force binary encoding to handle binary escape sequences like \xff
      processed = processed.force_encoding(Encoding::BINARY)

      compile_quietly(processed, flags)
    rescue RegexpError
      nil
    end

    # Compiles +source+ with warnings silenced, restoring $VERBOSE afterwards.
    #
    # I know, I know... this is awful, but the patterns come from Apache Tika
    # and we are getting warnings about character class overlaps, so we'll
    # suppress warnings for this Regexp compilation.
    # I'm open to better ideas.
    #
    # The previous value is captured *before* the protected region so the
    # ensure clause can never write back an unset (nil) value.
    def self.compile_quietly(source, flags)
      old_verbose = $VERBOSE
      begin
        $VERBOSE = nil
        Regexp.new(source, flags).freeze
      ensure
        $VERBOSE = old_verbose
      end
    end
    private_class_method :compile_quietly
  end
end
18 changes: 17 additions & 1 deletion script/generate_tables.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic.

require 'nokogiri'
require_relative '../lib/marcel/tika_regex'

class String
alias inspect_old inspect
Expand All @@ -27,6 +28,16 @@ def inspect
end
end

# Marker wrapper around a raw regex pattern taken from a `type="regex"` match.
#
# Its only job is to control how the value is printed into the generated
# table: instead of a plain string literal it is rendered as r[...], i.e. a
# lookup in the generated `r` hash.
class RegexString
  # @param pattern [String] the regex pattern text exactly as read from the XML
  def initialize(pattern)
    @pattern = pattern
  end

  # @return [String] the pattern's own inspect output wrapped in `r[` ... `]`
  def inspect
    format('r[%s]', @pattern.inspect)
  end
end

def str2int(s)
return s.to_i(16) if s[0..1].downcase == '0x'
return s.to_i(8) if s[0..0].downcase == '0'
Expand All @@ -39,6 +50,8 @@ def binary_strings(object)
object.map { |o| binary_strings(o) }
when String
BinaryString.new(object)
when RegexString
object
when Numeric, Range, nil
object
else
Expand All @@ -65,6 +78,8 @@ def get_matches(mime, parent)

offset = offset.size == 2 ? offset[0]..offset[1] : offset[0]
case type
when 'regex'
value = RegexString.new(value)
when 'string', 'stringignorecase'
value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') }
value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") }
Expand Down Expand Up @@ -231,11 +246,12 @@ def get_matches(mime, parent)
end
puts " }"
puts " b = Hash.new { |h, k| h[k] = k.b.freeze }"
puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }"
puts " # @private"
puts " # :nodoc:"
puts " MAGIC = ["
magics.each do |priority, type, matches|
puts " ['#{type}', #{binary_strings(matches).inspect}],"
puts " ['#{type.strip}', #{binary_strings(matches).inspect}],"
end
puts " ]"
puts "end"
Binary file not shown.
Binary file added test/fixtures/magic/application/x-bzip2/bzip2.bz2
Binary file not shown.
130 changes: 130 additions & 0 deletions test/magic_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,134 @@ class Marcel::MimeType::MagicTest < Marcel::TestCase
assert Marcel::Magic.child?('text/csv', 'text/plain')
refute Marcel::Magic.child?('text/plain', 'text/csv')
end

test "none of the regex patterns should match random test data" do
  skipped_types = %w( application/x-dbf )

  # Walks a rule tree depth-first and gathers every [offset, Regexp] pair,
  # including those nested in child rules.
  collect_regexes = lambda do |rules, found = []|
    rules.each do |offset, value, children|
      found << [offset, value] if value.is_a?(Regexp)
      collect_regexes.call(children, found) if children
    end
    found
  end

  # Sample input that no file-format regex is expected to match: emoji,
  # a few raw control/high bytes and a short ASCII word.
  sample = "🇨🇭 \xFF\xFE\x03\x05\x06🧀 cheese\x06\x07\x03"

  Marcel::MAGIC.each do |type, matching_rules|
    next if skipped_types.include?(type)

    collect_regexes.call(matching_rules).each do |offset, regex|
      scratch = String.new(encoding: Encoding::BINARY)
      outcome = Marcel::Magic.send(:match_regex, StringIO.new(sample), offset, regex, scratch)

      assert_equal false, outcome, "Test data unexpectedly matched a file format regexp (#{type}, #{regex.inspect})"
    end
  end
end

test "nested match: parent AND child must both match" do
  # A single rule: "AAA" at offset 0 with one child requiring "BBB" at offset 3.
  # "AAABBB" satisfies both levels; "AAAXXX" satisfies only the parent.
  rules = [[0, "AAA".b, [[3, "BBB".b]]]]
  scratch = String.new(encoding: Encoding::BINARY)
  matched = ->(content) { Marcel::Magic.send(:magic_match_io, StringIO.new(content), rules, scratch) }

  assert matched.call("AAABBB"), "Should match when parent and child both match"
  refute matched.call("AAAXXX"), "Should not match when parent matches but child doesn't"
end

test "sibling matches use OR logic" do
  # Two top-level rules at the same offset; satisfying either one is enough.
  rules = [
    [0, "XXX".b],
    [0, "YYY".b]
  ]
  scratch = String.new(encoding: Encoding::BINARY)
  matched = ->(content) { Marcel::Magic.send(:magic_match_io, StringIO.new(content), rules, scratch) }

  assert matched.call("XXX"), "Should match via first sibling rule"
  assert matched.call("YYY"), "Should match via second sibling rule"
  refute matched.call("ZZZ"), "Should not match when no sibling rule matches"
end

test "parent with multiple child alternatives (OR)" do
  # Shape under test: parent AND (child1 OR child2).
  # The parent needs "ROOT" at offset 0; each child is an alternative at offset 4.
  alternatives = [
    [4, "OPT1".b],
    [4, "OPT2".b]
  ]
  rules = [[0, "ROOT".b, alternatives]]
  scratch = String.new(encoding: Encoding::BINARY)
  matched = ->(content) { Marcel::Magic.send(:magic_match_io, StringIO.new(content), rules, scratch) }

  assert matched.call("ROOTOPT1"), "Should match when parent and first child match"
  assert matched.call("ROOTOPT2"), "Should match when parent and second child match"
  refute matched.call("ROOTXXXX"), "Should not match when parent matches but no child matches"
end

test "complex nested structure with multiple levels" do
  # Three levels chained with AND: "AAA" at 0, then "BBB" at 3, then "CCC" at 6.
  grandchild = [6, "CCC".b]
  child = [3, "BBB".b, [grandchild]]
  rules = [[0, "AAA".b, [child]]]
  scratch = String.new(encoding: Encoding::BINARY)
  matched = ->(content) { Marcel::Magic.send(:magic_match_io, StringIO.new(content), rules, scratch) }

  assert matched.call("AAABBBCCC"), "Should match when all nested levels match"
  refute matched.call("AAABBBXXX"), "Should not match when deepest child doesn't match"
end
end
Loading