Skip to content

Commit 60a1fe8

Browse files
committed
Handle invalid characters in XML
Because XML parsing errors sometimes occur due to characters that aren't valid within XML documents, the `XmlDocument` class now catches exceptions, tries to find and replace invalid characters (with transliterations or HTML entities), and retries parsing the document. If the `voku/portable-ascii` composer package is installed, it is used to find transliterations for those characters. Added the package to the `suggest` section in the composer.json file.
1 parent 698ffef commit 60a1fe8

File tree

4 files changed

+84
-3
lines changed

4 files changed

+84
-3
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
## [3.4.4] - 2025-04-04
10+
### Fixed
11+
* Because XML parsing errors sometimes occur due to characters that aren't valid within XML documents, the library now catches XML parsing errors, tries to find and replace invalid characters (with transliterations or HTML entities), and retries parsing the document. Works best when you additionally install the `voku/portable-ascii` composer package.
12+
913
## [3.4.3] - 2025-04-03
1014
### Fixed
1115
* When providing an empty base selector to an `Html` step (`Html::each('')`, `Html::first('')`, `Html::last('')`), the step no longer fails with an error, but instead logs a warning that an empty selector most likely doesn't make sense.

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@
5555
"symfony/process": "^6.0|^7.0"
5656
},
5757
"suggest": {
58-
"ext-zlib": "Needed to uncompress compressed responses"
58+
"ext-zlib": "Needed to uncompress compressed responses",
59+
"voku/portable-ascii": "Needed to find transliterations for characters that are invalid in XML documents (^2.0)"
5960
},
6061
"funding": [
6162
{

src/Steps/Dom/XmlDocument.php

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
use Crwlr\Utils\PhpVersion;
66
use DOMNode;
77
use Symfony\Component\DomCrawler\Crawler;
8+
use Throwable;
9+
use voku\helper\ASCII;
810

911
/**
1012
* @method XmlElement|null querySelector(string $selector)
@@ -33,9 +35,49 @@ protected function makeChildNodeInstance(object $node): Node
3335
protected function makeDocumentInstance(string $source): object
{
    // When PhpVersion::isAtLeast(8, 4) holds, try the native \Dom\XMLDocument parser first:
    // once with the source as given and, if that throws, once more with invalid characters replaced.
    if (PhpVersion::isAtLeast(8, 4)) {
        $options = LIBXML_NOERROR | LIBXML_NONET;

        try {
            return \Dom\XMLDocument::createFromString($source, $options);
        } catch (Throwable) {
            // First attempt failed: replace invalid characters below and retry once.
        }

        $source = $this->replaceInvalidXmlCharacters($source);

        try {
            return \Dom\XMLDocument::createFromString($source, $options);
        } catch (Throwable) {
            // If it fails again, try it with the symfony DOM Crawler as fallback (below).
        }
    }

    $crawler = new Crawler($source);

    if ($crawler->count() > 0) {
        return $crawler;
    }

    // The crawler reports a count of zero: retry once with invalid characters replaced.
    return new Crawler($this->replaceInvalidXmlCharacters($source));
}
60+
61+
/**
 * Replace characters that aren't valid within XML documents
 *
 * Sometimes XML parsing errors occur because of characters that aren't valid within XML documents.
 * Therefore, this method finds and replaces them with valid alternatives or HTML entities.
 * For best results in those cases, please install the voku/portable-ascii composer package.
 *
 * Characters in the supplementary planes (U+10000 - U+10FFFF, e.g. emoji) are valid according to
 * the XML 1.0 `Char` production and are therefore left untouched.
 *
 * @param string $value
 * @return string The value with invalid characters replaced, or the unchanged value when the
 *                regex replacement fails (e.g. because $value is not valid UTF-8).
 */
private function replaceInvalidXmlCharacters(string $value): string
{
    // Negated character class of the XML 1.0 `Char` production:
    // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    $invalidCharacter = '/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u';

    $replaced = preg_replace_callback(
        $invalidCharacter,
        static function (array $match) use ($invalidCharacter): string {
            // '?' is used as the "no transliteration available" marker, both when the optional
            // voku/portable-ascii package is not installed and when the fallback is needed.
            $replacement = class_exists(ASCII::class) ? ASCII::to_transliterate($match[0]) : '?';

            // Also fall back when the transliteration result still contains an invalid character
            // (e.g. when the character is handed back unchanged), otherwise nothing would be fixed.
            if ($replacement === '?' || preg_match($invalidCharacter, $replacement) === 1) {
                // NOTE(review): per the XML 1.0 spec, a numeric reference to a character outside
                // the `Char` range is itself not well-formed outside CDATA sections/comments.
                // Kept as is, because replacing with entities is the documented behavior.
                return '&#' . mb_ord($match[0]) . ';';
            }

            return $replacement;
        },
        $value,
    );

    // preg_replace_callback() returns null on error; keep the original value in that case.
    return $replaced ?? $value;
}
4183
}

tests/Steps/Dom/XmlDocumentTest.php

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,37 @@
6868

6969
expect($anyNodesChecked)->toBeTrue();
7070
});
71+
72+
//it('is able to parse documents containing characters that aren\'t valid within XML documents', function (string $char) {
73+
// $xml = <<<XML
74+
// <?xml version="1.0" encoding="UTF-8"?>
75+
// <rss>
76+
// <channel>
77+
// <items>
78+
// <item>
79+
// <title><![CDATA[foo - {$char} - bar]]></title>
80+
// </item>
81+
// </items>
82+
// </channel>
83+
// </rss>
84+
// XML;
85+
//
86+
// $document = new XmlDocument($xml);
87+
//
88+
// $titles = $document->querySelectorAll('channel item title');
89+
//
90+
// expect($titles)->toBeInstanceOf(NodeList::class)
91+
// ->and($titles->count())->toBe(1)
92+
// ->and($titles->first()?->text())->toStartWith('foo - ')
93+
// ->and($titles->first()?->text())->toEndWith(' - bar');
94+
//})->with([
95+
// [mb_chr(0)],
96+
// [mb_chr(6)],
97+
// [mb_chr(12)],
98+
// [mb_chr(20)],
99+
// [mb_chr(31)],
100+
// [mb_chr(128)],
101+
// [mb_chr(157)],
102+
// [mb_chr(195)],
103+
// [mb_chr(253)],
104+
//])->only();

0 commit comments

Comments
 (0)