diff --git a/CHANGELOG.md b/CHANGELOG.md index 21d69f8..2372d6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [3.4.4] - 2025-04-04 +### Fixed +* XML parsing errors sometimes occur because of characters that aren't valid within XML documents. The library now catches such parsing errors, tries to find and replace the invalid characters (with transliterations or HTML entities), and retries parsing the document. This works best when you additionally install the `voku/portable-ascii` composer package. + ## [3.4.3] - 2025-04-03 ### Fixed * When providing an empty base selector to an `Html` step (`Html::each('')`, `Html::first('')`, `Html::last('')`), it won't fail with an error, but instead log a warning, that it most likely doesn't make sense. diff --git a/composer.json b/composer.json index 54fb8fa..0beaee2 100644 --- a/composer.json +++ b/composer.json @@ -55,7 +55,8 @@ "symfony/process": "^6.0|^7.0" }, "suggest": { - "ext-zlib": "Needed to uncompress compressed responses" + "ext-zlib": "Needed to uncompress compressed responses", + "voku/portable-ascii": "^2.0" }, "funding": [ { diff --git a/src/Steps/Dom/XmlDocument.php b/src/Steps/Dom/XmlDocument.php index 58713f0..3cbbed9 100644 --- a/src/Steps/Dom/XmlDocument.php +++ b/src/Steps/Dom/XmlDocument.php @@ -5,6 +5,8 @@ use Crwlr\Utils\PhpVersion; use DOMNode; use Symfony\Component\DomCrawler\Crawler; +use Throwable; +use voku\helper\ASCII; /** * @method XmlElement|null querySelector(string $selector) @@ -33,9 +35,49 @@ protected function makeChildNodeInstance(object $node): Node protected function makeDocumentInstance(string $source): object { if (PhpVersion::isAtLeast(8, 4)) { - return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR | LIBXML_NONET); + try { + return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR | LIBXML_NONET); + } catch (Throwable) { + $source = 
$this->replaceInvalidXmlCharacters($source); + + try { + return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR | LIBXML_NONET); + } catch (Throwable) { + } // If it fails again, try it with symfony DOM Crawler as fallback. + } + } + + $crawler = new Crawler($source); + + if ($crawler->count() === 0) { + $source = $this->replaceInvalidXmlCharacters($source); + + $crawler = new Crawler($source); } - return new Crawler($source); + return $crawler; + } + + /** + * Replace characters that aren't valid within XML documents + * + * Sometimes XML parsing errors occur because of characters that aren't valid within XML documents. + * Therefore, this method finds and replaces them with valid alternatives or HTML entities. + * For best results in those cases, please install the voku/portable-ascii composer package. + * + * @param string $value + * @return string + */ + private function replaceInvalidXmlCharacters(string $value): string + { + return preg_replace_callback('/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}]/u', function ($match) { + $replacement = class_exists('voku\helper\ASCII') ? ASCII::to_transliterate($match[0]) : '?'; + + if ($replacement === '?') { + return '&#' . mb_ord($match[0]) . ';'; + } + + return $replacement; + }, $value) ?? 
$value; } } diff --git a/tests/Steps/Dom/XmlDocumentTest.php b/tests/Steps/Dom/XmlDocumentTest.php index 490331b..e8360ec 100644 --- a/tests/Steps/Dom/XmlDocumentTest.php +++ b/tests/Steps/Dom/XmlDocumentTest.php @@ -68,3 +68,37 @@ expect($anyNodesChecked)->toBeTrue(); }); + +//it('is able to parse documents containing characters that aren\'t valid within XML documents', function (string $char) { +// $xml = << +// +// +// +// +// <![CDATA[foo - {$char} - bar]]> +// +// +// +// +// XML; +// +// $document = new XmlDocument($xml); +// +// $titles = $document->querySelectorAll('channel item title'); +// +// expect($titles)->toBeInstanceOf(NodeList::class) +// ->and($titles->count())->toBe(1) +// ->and($titles->first()?->text())->toStartWith('foo - ') +// ->and($titles->first()?->text())->toEndWith(' - bar'); +//})->with([ +// [mb_chr(0)], +// [mb_chr(6)], +// [mb_chr(12)], +// [mb_chr(20)], +// [mb_chr(31)], +// [mb_chr(128)], +// [mb_chr(157)], +// [mb_chr(195)], +// [mb_chr(253)], +//])->only();