Commit 4af729f3, authored by Miklos Vajna

tdf#114428 filter: recognize XHTML with XML declaration as HTML

The problem was the additional

	<?xml version="1.0" encoding="utf-8"?>

XML declaration before the usual

	<!DOCTYPE html ...

line, just ignore it.

Change-Id: I294aae5504b40b42f76da00fef645d0d89009da9
Reviewed-on: https://gerrit.libreoffice.org/46324
Reviewed-by: Miklos Vajna <vmiklos@collabora.co.uk>
Tested-by: Jenkins <ci@libreoffice.org>
Parent: 1d6b85f8
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Declare the CppUnit test target "filter_textfilterdetect".
$(eval $(call gb_CppunitTest_CppunitTest,filter_textfilterdetect))
# UNO API sets the test is built against.
$(eval $(call gb_CppunitTest_use_api,filter_textfilterdetect,\
offapi \
udkapi \
))
# Libraries used by the test.
# NOTE(review): textfd is presumably the filter-detection library under test
# (it matches the textfd component registered below) -- confirm.
$(eval $(call gb_CppunitTest_use_libraries,filter_textfilterdetect, \
comphelper \
cppu \
cppuhelper \
sal \
test \
textfd \
tl \
unotest \
utl \
))
# The single source object that makes up the test: filter/qa/unit/textfilterdetect.
$(eval $(call gb_CppunitTest_add_exception_objects,filter_textfilterdetect, \
filter/qa/unit/textfilterdetect \
))
# Runtime environment requested for the test: URE and VCL.
$(eval $(call gb_CppunitTest_use_ure,filter_textfilterdetect))
$(eval $(call gb_CppunitTest_use_vcl,filter_textfilterdetect))
# UNO components made available to the test: configuration manager, the
# textfd detection component, and the UCB core plus its file content provider.
$(eval $(call gb_CppunitTest_use_components,filter_textfilterdetect,\
configmgr/source/configmgr \
filter/source/textfilterdetect/textfd \
ucb/source/core/ucb1 \
ucb/source/ucp/file/ucpfile1 \
))
# Give the test access to the office configuration.
$(eval $(call gb_CppunitTest_use_configuration,filter_textfilterdetect))
# vim: set noet sw=4 ts=4:
......@@ -57,6 +57,7 @@ $(eval $(call gb_Module_add_check_targets,filter,\
CppunitTest_filter_xslt \
CppunitTest_filter_priority \
CppunitTest_filter_msfilter \
CppunitTest_filter_textfilterdetect \
))
ifneq ($(DISABLE_CVE_TESTS),TRUE)
......
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Title of document</title>
</head>
<body>hello world</body>
</html>
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include <com/sun/star/document/XExtendedFilterDetection.hpp>
#include <com/sun/star/io/XInputStream.hpp>
#include <comphelper/processfactory.hxx>
#include <comphelper/propertyvalue.hxx>
#include <test/bootstrapfixture.hxx>
#include <unotools/mediadescriptor.hxx>
#include <unotools/streamwrap.hxx>
using namespace com::sun::star;
namespace
{
/// Test fixture for the PlainTextFilterDetect service
/// (com.sun.star.comp.filters.PlainTextFilterDetect), built on test::BootstrapFixture.
class TextFilterDetectTest : public test::BootstrapFixture
{
public:
// Regression test for tdf#114428: XHTML that starts with an XML declaration.
void testTdf114428();
// CppUnit suite declaration: lists the test methods that belong to this fixture.
CPPUNIT_TEST_SUITE(TextFilterDetectTest);
CPPUNIT_TEST(testTdf114428);
CPPUNIT_TEST_SUITE_END();
};
/// Directory holding the test documents; it is passed to
/// m_directories.getURLFromSrc() to build the URL of each input file.
char const DATA_DIRECTORY[] = "/filter/qa/unit/data/";
void TextFilterDetectTest::testTdf114428()
{
uno::Reference<uno::XComponentContext> xComponentContext
= comphelper::getComponentContext(getMultiServiceFactory());
uno::Reference<document::XExtendedFilterDetection> xDetect(
getMultiServiceFactory()->createInstance("com.sun.star.comp.filters.PlainTextFilterDetect"),
uno::UNO_QUERY);
OUString aURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "tdf114428.xhtml";
SvFileStream aStream(aURL, StreamMode::READ);
uno::Reference<io::XInputStream> xStream(new utl::OStreamWrapper(aStream));
uno::Sequence<beans::PropertyValue> aDescriptor
= { comphelper::makePropertyValue("DocumentService",
OUString("com.sun.star.text.TextDocument")),
comphelper::makePropertyValue("InputStream", xStream),
comphelper::makePropertyValue("TypeName", OUString("generic_HTML")) };
xDetect->detect(aDescriptor);
utl::MediaDescriptor aMediaDesc(aDescriptor);
OUString aFilterName = aMediaDesc.getUnpackedValueOrDefault("FilterName", OUString());
// This was empty, XML declaration caused HTML detect to not handle XHTML.
CPPUNIT_ASSERT_EQUAL(OUString("HTML (StarWriter)"), aFilterName);
}
// Register the TextFilterDetectTest suite with CppUnit so the test runner picks it up.
CPPUNIT_TEST_SUITE_REGISTRATION(TextFilterDetectTest);
}
// Emit the CppUnit test plug-in entry point for this test library.
CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
......@@ -58,6 +58,13 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
// Now check whether the stream begins with a known HTML tag.
enum DetectPhase { BeforeTag, TagOpened, InTagName };
DetectPhase dp = BeforeTag;
/// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration.
enum DeclarationPhase
{
BeforeDeclaration,
DeclarationOpened
};
DeclarationPhase eDeclaration = BeforeDeclaration;
const char* pHeader = sHeader.getStr();
const int nLength = sHeader.getLength();
......@@ -66,7 +73,8 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
for ( i = 0; i < nLength; ++i, ++pHeader )
{
char c = *pHeader;
if ( c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' )
if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f')
&& eDeclaration == BeforeDeclaration)
{
if ( dp == TagOpened )
return false; // Invalid: Should start with a tag name
......@@ -84,6 +92,11 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
{
if ( dp == InTagName )
break; // End of tag name reached
else if (eDeclaration == DeclarationOpened)
{
dp = BeforeTag;
eDeclaration = BeforeDeclaration;
}
else
return false; // Invalid: Empty tag or before '<'
}
......@@ -100,8 +113,13 @@ bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
return false; // Invalid: Should start with a tag
else if ( dp == TagOpened )
{
nStartOfTagIndex = i;
dp = InTagName;
if (c == '?' && eDeclaration == BeforeDeclaration)
eDeclaration = DeclarationOpened;
else if (eDeclaration == BeforeDeclaration)
{
nStartOfTagIndex = i;
dp = InTagName;
}
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment