diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index 8a41640..9da3e91 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -1,6 +1,6 @@ # This file was autogenerated by maturin v1.6.0 using: # maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest -# +# # Then adapted to the project # name: CI @@ -38,7 +38,7 @@ jobs: python-version: '3.8' # On linux we don't use graalvm/setup-graalvm@v1.2.5 action to install graalvm because it will install it - # on the runner machine and on linux the build will happen inside a manylinux docker. + # on the runner machine and on linux the build will happen inside a manylinux docker. # Instead, we use a script to install graalvm inside the docker container # the script is launched by setting the before-script-linux config option of the maturin action - name: Build wheels @@ -60,7 +60,7 @@ jobs: with: name: wheels-linux-${{ matrix.platform.target }} path: bindings/extractous-python/dist - + - name: pytest if: ${{ startsWith(matrix.platform.target, 'x86_64') }} shell: bash @@ -70,7 +70,7 @@ jobs: python3 -m venv .venv source .venv/bin/activate pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip install pytest scikit-learn lxml cd bindings/extractous-python pytest -s @@ -85,7 +85,7 @@ jobs: apt-get update apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara apt-get install -y --no-install-recommends python3 python3-pip - pip3 install -U pip pytest scikit-learn + pip3 install -U pip pytest scikit-learn lxml run: | set -e pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall @@ -129,7 +129,7 @@ jobs: python -m venv .venv .venv\Scripts\activate.bat pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip 
install pytest scikit-learn lxml cd bindings\extractous-python pytest -s . @@ -186,7 +186,7 @@ jobs: python3 -m venv .venv source .venv/bin/activate pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip install pytest scikit-learn lxml cd bindings/extractous-python pytest -s @@ -206,7 +206,7 @@ jobs: name: wheels-sdist path: bindings/extractous-python/dist - # Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ + # Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ # We use 2 actions one to publish on PyPi on tag pushes to main brnach and the other to publish on TestPyPi on any push publish-to-testpypi: name: Publish to TestPyPI diff --git a/README.md b/README.md index 78ac3da..da0f893 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ from extractous import Extractor # Create a new extractor extractor = Extractor() extractor = extractor.set_extract_string_max_length(1000) +# if you need an xml +# extractor = extractor.set_xml_output(True) # Extract text from a file result, metadata = extractor.extract_file_to_string("README.md") @@ -125,6 +127,8 @@ use extractous::Extractor; fn main() { // Create a new extractor. 
Note it uses a consuming builder pattern let mut extractor = Extractor::new().set_extract_string_max_length(1000); + // if you need an xml + // extractor = extractor.set_xml_output(true); // Extract text from a file let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap(); diff --git a/bindings/extractous-python/README.md b/bindings/extractous-python/README.md index 2f3d558..fab132d 100644 --- a/bindings/extractous-python/README.md +++ b/bindings/extractous-python/README.md @@ -20,7 +20,9 @@ from extractous import Extractor # Create a new extractor extractor = Extractor() -extractor.set_extract_string_max_length(1000) +extractor = extractor.set_extract_string_max_length(1000) +# if you need an xml +# extractor = extractor.set_xml_output(True) # Extract text from a file result, metadata = extractor.extract_file_to_string("README.md") diff --git a/bindings/extractous-python/pyproject.toml b/bindings/extractous-python/pyproject.toml index 3ce2b8f..16f1805 100644 --- a/bindings/extractous-python/pyproject.toml +++ b/bindings/extractous-python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "extractous" -version = '0.2.0' +version = '0.2.1' classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", @@ -50,11 +50,11 @@ module-name = "extractous._extractous" python-source = "python" # Setting skip-auditwheel=true is very important to instruct maturin to not run its auditwheel flow -# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs +# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs # By skipping the wheel, we just get a plain _extracts_rs* lib, and we have to: # * bundle our graalvm libs using the below include [] directive # * change the RPATH of _extracts_rs* lib to be able to properly find the bundled graalvm libs -skip-auditwheel=true 
+skip-auditwheel=true # This tells cargo to set the RPATH for the private module built lib _extractous.abi3.so # Set the RPATH to $ORIGIN because the graalvm libs will be bundled in the same dir as the _extractous.abi3.so @@ -62,7 +62,7 @@ rustc-args = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"] # Maturin include command will start looking from the python/extractous folder # so to include the graalvm libs the rust build script must copy them to python/extractous folder -include = [ +include = [ {path = "**/*.so", format = ["wheel"]}, {path = "**/*.dylib", format = ["wheel"]}, {path = "**/*.dll", format = ["wheel"]} diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs index 3ccfdd6..0a9f121 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -136,6 +136,12 @@ impl Extractor { Ok(Self(inner)) } + /// Set the configuration for the parse as xml + pub fn set_xml_output(&self, xml_output: bool) -> PyResult { + let inner = self.0.clone().set_xml_output(xml_output); + Ok(Self(inner)) + } + /// Extracts text from a file path. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. 
pub fn extract_file<'py>( diff --git a/bindings/extractous-python/tests/test_extract_bytes.py b/bindings/extractous-python/tests/test_extract_bytes.py index df0ca07..a40a77c 100644 --- a/bindings/extractous-python/tests/test_extract_bytes.py +++ b/bindings/extractous-python/tests/test_extract_bytes.py @@ -3,7 +3,7 @@ from extractous import Extractor from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \ - is_expected_metadata_contained + is_expected_metadata_contained, extract_body_text TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9, 0.8), @@ -49,6 +49,36 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist): assert percent_similarity >= metadata_dist, \ f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_bytes_to_string_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction from bytes of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + extractor = extractor.set_xml_output(True) + result_xml, metadata = extractor.extract_file_to_string(original_filepath) + result_text = extract_body_text(result_xml) + + # Check Expected + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + 
percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" + @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist): """Test the extraction from bytes of various file types.""" @@ -76,4 +106,36 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist): # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) assert percent_similarity >= metadata_dist, \ - f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" + + +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_bytes_to_stream_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction from bytes to stream as xml of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + extractor = extractor.set_xml_output(True) + reader, metadata = extractor.extract_bytes(file_bytes) + result_xml = read_to_string(reader) + result_text = extract_body_text(result_xml) + + # Check Expected + assert cosine_similarity(result_text, expected) 
>= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" diff --git a/bindings/extractous-python/tests/test_extract_file.py b/bindings/extractous-python/tests/test_extract_file.py index 15b97cc..6b0b8c9 100644 --- a/bindings/extractous-python/tests/test_extract_file.py +++ b/bindings/extractous-python/tests/test_extract_file.py @@ -2,7 +2,7 @@ import pytest from extractous import Extractor -from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string +from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9, 0.8), @@ -21,7 +21,7 @@ @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_file_to_string(file_name, target_dist, metadata_dist): - """Test the extraction and comparison of various file types.""" + """Test the extraction to string as plain text of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" @@ -49,10 +49,39 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist): assert percent_similarity >= metadata_dist, \ f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_file_to_string_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction to string as XML of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + extractor = Extractor() + extractor = extractor.set_xml_output(True) + result_xml, metadata = extractor.extract_file_to_string(original_filepath) + result_text = extract_body_text(result_xml) + + # Check extracted + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" + + @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_file_to_stream(file_name, target_dist, metadata_dist): - """Test the extraction from bytes of various file types.""" + """Test the extraction from bytes to stream of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" @@ -75,4 +104,34 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist): # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) assert percent_similarity >= metadata_dist, \ - f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" + + +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_file_to_stream_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction from bytes of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + extractor = Extractor() + extractor = extractor.set_xml_output(True) + reader, metadata = extractor.extract_file(original_filepath) + result_xml = read_to_string(reader) + result_text = extract_body_text(result_xml) + + # Check extracted + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py index 34e4acf..598771a 100644 --- a/bindings/extractous-python/tests/test_extract_url.py +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -16,4 +16,13 @@ def test_extract_url_to_string(): content, metadata = extractor.extract_url_to_string("https://www.google.com") assert "Google" in content - assert len(metadata.keys()) > 0 \ No newline at end of file + assert len(metadata.keys()) > 0 + +def test_extract_url_to_string_as_xml(): + extractor = Extractor() + extractor = extractor.set_xml_output(True) + + content, metadata = extractor.extract_url_to_string("https://www.google.com") + + assert "Google" in content + assert len(metadata.keys()) > 0 diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py index a04163d..7c96d84 100644 --- a/bindings/extractous-python/tests/test_pdf.py +++ b/bindings/extractous-python/tests/test_pdf.py @@ -1,5 +1,5 @@ from extractous import Extractor -from utils import read_to_string +from utils import read_to_string, extract_body_text def expected_result(): @@ -22,6 +22,17 @@ def test_extract_file(): print(f"test_pdf:test_extract_file result = {result}") assert result == expected_result() +def test_extract_file_as_xml(): + extractor = Extractor() + extractor = extractor.set_xml_output(True) + reader, metadata = extractor.extract_file("tests/quarkus.pdf") + + result_xml = read_to_string(reader) + + print(f"test_pdf:test_extract_file_as_xml result = {result_xml}") + result_text = extract_body_text(result_xml) + assert result_text.strip() == expected_result().strip() + def test_extract_bytes(): extractor = Extractor() @@ -33,3 +44,17 @@ def test_extract_bytes(): print(f"test_pdf:test_extract_bytes result = {result}") assert result == expected_result() + +def test_extract_bytes_as_xml(): + extractor = 
Extractor() + extractor = extractor.set_xml_output(True) + + with open("tests/quarkus.pdf", "rb") as file: + buffer = bytearray(file.read()) + reader, metadata = extractor.extract_bytes(buffer) + + result_xml = read_to_string(reader) + + print(f"test_pdf:test_extract_bytes_as_xml result = {result_xml}") + result_text = extract_body_text(result_xml) + assert result_text.strip() == expected_result().strip() diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py index 8368db9..c72f825 100644 --- a/bindings/extractous-python/tests/utils.py +++ b/bindings/extractous-python/tests/utils.py @@ -1,6 +1,6 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity as cosine_sim - +from lxml import etree def cosine_similarity(text1, text2): """Calculate the cosine similarity between two texts.""" @@ -78,3 +78,20 @@ def calculate_similarity_percent(expected, current): # Return the similarity percentage return matches / total + + +def extract_body_text(xml: str) -> str: + """ + Extracts and returns plain text content from the section of an XML + string. 
+ try: + parser = etree.XMLParser(recover=True) + root = etree.fromstring(xml.encode(), parser=parser) + ns = {"ns": "http://www.w3.org/1999/xhtml"} + body = root.find(".//ns:body", namespaces=ns) + if body is None: + return "" + return "\n".join(body.itertext()).strip() + except etree.XMLSyntaxError as e: + raise ValueError(f"Invalid XML input: {e}") diff --git a/extractous-core/Cargo.lock b/extractous-core/Cargo.lock index e5e4fe4..8c081e6 100644 --- a/extractous-core/Cargo.lock +++ b/extractous-core/Cargo.lock @@ -465,7 +465,7 @@ dependencies = [ [[package]] name = "extractous" -version = "0.2.0" +version = "0.2.1" dependencies = [ "bytemuck", "criterion", @@ -473,6 +473,7 @@ dependencies = [ "fs_extra", "jni", "libc", + "quick-xml", "reqwest", "serde", "serde_json", @@ -482,6 +483,7 @@ dependencies = [ "test-case", "textdistance", "thiserror", + "walkdir", "zip", ] @@ -1324,6 +1326,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quick-xml" +version = "0.37.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml index 01251b6..8472540 100644 --- a/extractous-core/Cargo.toml +++ b/extractous-core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "extractous" -version = "0.2.0" +version = "0.2.1" edition = "2021" description = """ @@ -36,6 +36,7 @@ test-case = "3.0" criterion = "0.5.1" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +quick-xml = "0.37.1" [build-dependencies] fs_extra = { version = "1.3.0" } @@ -43,6 +44,7 @@ reqwest = { version = "0.12.7", features = ["blocking", "json"] } zip = "2.2.0" flate2 = "1.0.33" tar = "0.4.41" +walkdir = "2.5.0" [profile.release] opt-level = 3 diff --git a/extractous-core/README.md b/extractous-core/README.md index 0c87db6..83e6e30 100644 --- 
a/extractous-core/README.md +++ b/extractous-core/README.md @@ -43,7 +43,10 @@ fn main() { let file_path = &args[1]; // Extract the provided file content to a string - let extractor = Extractor::new(); + let mut extractor = Extractor::new(); + // if you need an xml + // extractor = extractor.set_xml_output(false); + // Extract text from a file let (content, metadata) = extractor.extract_file_to_string(file_path).unwrap(); println!("{}", content); println!("{:?}", metadata); @@ -128,7 +131,7 @@ installed on your system because some of the OCR tests will fail if no tesseract * `sudo apt install tesseract-ocr` * Install any language extensions you want. for example to install German and Arabic: * `sudo apt install tesseract-ocr-deu tesseract-ocr-ara` -* On Mac +* On Mac * `brew install tesseract tesseract-lang` ### Building Extractous diff --git a/extractous-core/build.rs b/extractous-core/build.rs index 1aeeb17..55046a4 100644 --- a/extractous-core/build.rs +++ b/extractous-core/build.rs @@ -3,6 +3,7 @@ use std::fs; use std::io; use std::path::{Path, PathBuf}; use std::process::Command; +use walkdir::WalkDir; fn main() { // Exit early when building docs or when running clippy @@ -29,6 +30,17 @@ fn main() { //println!("cargo:warning=dist_dir: {}", dist_dir.display()); //println!("cargo:warning=out_dir: {}", out_dir.display()); //println!("cargo:warning=tika_native_dir: {:?}", tika_native_dir); + let tika_native_dir = out_dir.join("tika-native"); + let mut need_build = false; + if is_dir_updated(&tika_native_source_dir, &tika_native_dir) { + println!("Lib tika_native files were updated"); + fs_extra::dir::remove(&libs_out_dir).ok(); + fs_extra::dir::remove(&tika_native_dir).ok(); + need_build = true; + // Launch the gradle build + } else { + println!("Lib tika_native files were not updated"); + } // Try to find already built libs match find_already_built_libs(&out_dir) { @@ -39,15 +51,17 @@ fn main() { copy_build_artifacts(&libs_dir, vec![&libs_out_dir], false); } } - 
None => { - // Launch the gradle build - gradle_build( - &tika_native_source_dir, - &out_dir, - &libs_out_dir, - &python_bind_dir, - ); - } + None => { need_build = true; } + } + + // Launch the gradle build + if need_build { + gradle_build( + &tika_native_source_dir, + &out_dir, + &libs_out_dir, + &python_bind_dir, + ); } // Tell cargo to look for shared libraries in the specified directory @@ -93,17 +107,35 @@ fn find_already_built_libs(out_dir: &Path) -> Option { } fn is_dir_updated(src: &Path, dest: &Path) -> bool { - let src_modified = fs::metadata(src) - .and_then(|meta| meta.modified()) - .ok(); - let dest_modified = fs::metadata(dest) - .and_then(|meta| meta.modified()) - .ok(); - - match (src_modified, dest_modified) { - (Some(src_time), Some(dest_time)) => src_time > dest_time, - _ => true, // If either timestamp is unavailable, consider the source as updated + for entry in WalkDir::new(src).into_iter().filter_map(|e| e.ok()) { + if entry.file_type().is_file() { + let src_file = entry.path(); + let relative_path = src_file.strip_prefix(src).unwrap(); + let dest_file = dest.join(relative_path); + + if !dest_file.exists() { + // File does not exist in the destination directory + return true; + } + + let src_modified = match fs::metadata(src_file).and_then(|meta| meta.modified()) { + Ok(time) => time, + Err(_) => continue, // Skip unreadable files + }; + + let dest_modified = match fs::metadata(&dest_file).and_then(|meta| meta.modified()) { + Ok(time) => time, + Err(_) => return true, // File in dest is inaccessible + }; + + if src_modified > dest_modified { + // Source file is newer than the destination file + return true; + } + } } + // All checks passed + false } // Run the gradle build command to build tika-native @@ -122,7 +154,7 @@ fn gradle_build( println!("Using GraalVM JDK found at {}", graalvm_home.display()); println!("Building tika_native libs this might take a while ... 
Please be patient!!"); - if is_dir_updated(&tika_native_dir, &out_dir) { + if is_dir_updated(&tika_native_source_dir, &tika_native_dir) { println!("Lib tika_native files were updated"); fs_extra::dir::remove(&tika_native_dir).ok(); } diff --git a/extractous-core/examples/extract_to_stream.rs b/extractous-core/examples/extract_to_stream.rs index 08d4386..fca3e02 100644 --- a/extractous-core/examples/extract_to_stream.rs +++ b/extractous-core/examples/extract_to_stream.rs @@ -8,7 +8,7 @@ fn main() { let file_path = &args[1]; // Extract the provided file content to a string - let extractor = Extractor::new(); + let extractor = Extractor::new().set_xml_output(true); let (stream, _metadata) = extractor.extract_file(file_path).unwrap(); // Extract url // let stream = extractor.extract_url("https://www.google.com/").unwrap(); diff --git a/extractous-core/examples/extract_to_string.rs b/extractous-core/examples/extract_to_string.rs index 36b2916..65db3f9 100644 --- a/extractous-core/examples/extract_to_string.rs +++ b/extractous-core/examples/extract_to_string.rs @@ -6,7 +6,7 @@ fn main() { let file_path = &args[1]; // Extract the provided file content to a string - let extractor = Extractor::new(); + let extractor = Extractor::new().set_xml_output(true); let (content, _metadata) = extractor.extract_file_to_string(file_path).unwrap(); println!("{}", content); } diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index b8f220d..5509de0 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -65,6 +65,7 @@ pub struct Extractor { pdf_config: PdfParserConfig, office_config: OfficeParserConfig, ocr_config: TesseractOcrConfig, + xml_output: bool, } impl Default for Extractor { @@ -75,6 +76,7 @@ impl Default for Extractor { pdf_config: PdfParserConfig::default(), office_config: OfficeParserConfig::default(), ocr_config: TesseractOcrConfig::default(), + xml_output: false, } } } @@ -117,6 +119,12 @@ impl Extractor { self 
} + /// Set the configuration for the parse as xml + pub fn set_xml_output(mut self, xml_output: bool) -> Self { + self.xml_output = xml_output; + self + } + /// Extracts text from a file path. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` pub fn extract_file(&self, file_path: &str) -> ExtractResult<(StreamReader, Metadata)> { @@ -126,6 +134,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -138,6 +147,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -150,6 +160,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -162,6 +173,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -174,6 +186,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -186,8 +199,10 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } + } #[cfg(test)] @@ -197,6 +212,7 @@ mod tests { use std::fs::File; use std::io::BufReader; use std::io::{self, Read}; + use std::str; const TEST_FILE: &str = "README.md"; @@ -292,4 +308,20 @@ mod tests { "Metadata should contain at least one entry" ); } + + #[test] + fn extract_file_to_xml_test() { + // Parse the files using extractous + let extractor = Extractor::new().set_xml_output(true); + let result = extractor.extract_file_to_string(TEST_FILE); + let (content, metadata) = result.unwrap(); + assert!( + content.len() > 0, + "Metadata should contain at least one entry" + ); + assert!( + metadata.len() > 0, + "Metadata should contain at least one entry" + ); + } } diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs index 4f941af..fe1f78b 100644 --- a/extractous-core/src/tika/parse.rs +++ b/extractous-core/src/tika/parse.rs @@ -32,6 +32,7 @@ fn 
parse_to_stream( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, method_name: &str, signature: &str, ) -> ExtractResult<(StreamReader, Metadata)> { @@ -52,6 +53,7 @@ fn parse_to_stream( (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), (&j_ocr_conf.internal).into(), + JValue::Bool(if as_xml { 1 } else { 0 }), ], ); let call_result_obj = call_result?.l()?; @@ -69,6 +71,7 @@ pub fn parse_file( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool ) -> ExtractResult<(StreamReader, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -80,12 +83,14 @@ pub fn parse_file( pdf_conf, office_conf, ocr_conf, + as_xml, "parseFile", "(Ljava/lang/String;\ Ljava/lang/String;\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/ReaderResult;", ) } @@ -96,6 +101,7 @@ pub fn parse_bytes( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(StreamReader, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -112,12 +118,14 @@ pub fn parse_bytes( pdf_conf, office_conf, ocr_conf, + as_xml, "parseBytes", "(Ljava/nio/ByteBuffer;\ Ljava/lang/String;\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/ReaderResult;", ) } @@ -128,6 +136,7 @@ pub fn parse_url( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(StreamReader, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -139,12 +148,14 @@ pub fn parse_url( pdf_conf, office_conf, ocr_conf, + as_xml, "parseUrl", "(Ljava/lang/String;\ Ljava/lang/String;\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ 
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/ReaderResult;", ) } @@ -157,6 +168,7 @@ pub fn parse_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, method_name: &str, signature: &str, ) -> ExtractResult<(String, Metadata)> { @@ -175,6 +187,7 @@ pub fn parse_to_string( (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), (&j_ocr_conf.internal).into(), + JValue::Bool(if as_xml { 1 } else { 0 }), ], ); let call_result_obj = call_result?.l()?; @@ -191,6 +204,7 @@ pub fn parse_file_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -202,12 +216,14 @@ pub fn parse_file_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseFileToString", "(Ljava/lang/String;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } @@ -219,6 +235,7 @@ pub fn parse_bytes_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -235,12 +252,14 @@ pub fn parse_bytes_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseBytesToString", "(Ljava/nio/ByteBuffer;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } @@ -252,6 +271,7 @@ pub fn parse_url_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ 
-263,12 +283,14 @@ pub fn parse_url_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseUrlToString", "(Ljava/lang/String;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } diff --git a/extractous-core/tests/extract_to_xml_tests.rs b/extractous-core/tests/extract_to_xml_tests.rs new file mode 100644 index 0000000..a645053 --- /dev/null +++ b/extractous-core/tests/extract_to_xml_tests.rs @@ -0,0 +1,90 @@ +use extractous::Extractor; +use std::fs; +use test_case::test_case; +use textdistance::nstr::cosine; +use quick_xml::reader::Reader; +use quick_xml::events::Event; + + +// Declares the shared test_utils code as a module in this integration test +mod test_utils; + +fn extract_p_tag_content(xml: &str) -> String { + let mut reader = Reader::from_str(xml); + reader.config_mut().trim_text(true); // Trim surrounding whitespace + let mut buf = Vec::new(); + let mut collected_content = String::new(); + let mut inside_body = false; + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) if e.name().as_ref() == b"body" => { + inside_body = true; + } + Ok(Event::End(ref e)) if e.name().as_ref() == b"body" => { + inside_body = false; + } + Ok(Event::Text(e)) if inside_body => { + collected_content.push_str(&e.unescape().unwrap().into_owned()); + collected_content.push('\n'); // Separate paragraphs with newline + } + Ok(Event::Eof) => break, + Err(e) => { + eprintln!("Error reading XML: {}", e); + break; + } + _ => (), + } + buf.clear(); + } + + collected_content.trim_end().to_string() +} + +#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] +#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] +#[test_case("simple.odt", 0.8; "Test ODT file")] +#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")] +#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")]
+#[test_case("category-level.docx", 0.8; "Test DOCX file")] +#[test_case("simple.doc", 0.8; "Test DOC file")] +#[test_case("simple.pptx", 0.9; "Test another PPTX file")] +#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")] +#[test_case("winter-sports.epub", 0.8; "Test EPUB file")] +#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")] +//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")] +fn test_extract_file_to_xml(file_name: &str, target_dist: f64) { + let extractor = Extractor::new().set_extract_string_max_length(1000000) + .set_xml_output(true); + // extract file with extractor + let (extracted_xml, extracted_metadata) = extractor + .extract_file_to_string(&format!("../test_files/documents/{}", file_name)) + .unwrap(); + println!("{}: {}", file_name, extracted_xml); + let extracted = extract_p_tag_content(&extracted_xml); + + // read expected string + let expected = + fs::read_to_string(format!("../test_files/expected_result/{}.txt", file_name)).unwrap(); + + let dist = cosine(&expected.trim(), &extracted.trim()); + assert!( + dist > target_dist, + "Cosine similarity is less than {} for file: {}, dist: {}", + target_dist, + file_name, + dist + ); + println!("{}: {}", file_name, dist); + + // read expected metadata + let expected_metadata = test_utils::parse_metadata_file(&format!( + "../test_files/expected_result/{}.metadata.json", + file_name + )); + + assert!(test_utils::is_expected_metadata_contained( + &expected_metadata, + &extracted_metadata + )); +} diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/ParsingReader.java b/extractous-core/tika-native/src/main/java/ai/yobix/ParsingReader.java new file mode 100644 index 0000000..8a7d621 --- /dev/null +++ b/extractous-core/tika-native/src/main/java/ai/yobix/ParsingReader.java @@ -0,0 +1,104 @@ +package ai.yobix; + +import java.io.*; +import java.util.concurrent.Executor; + +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import 
org.xml.sax.ContentHandler; +import org.apache.tika.exception.ZeroByteFileException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; + +public class ParsingReader extends Reader { + + private final Parser parser; + private final Reader reader; + private final PipedOutputStream pipedOutputStream; + private final InputStream stream; + private final Metadata metadata; + private final ParseContext context; + private final boolean outputXml; + private final String encoding; + private transient Throwable throwable; + + public ParsingReader(Parser parser, InputStream stream, Metadata metadata, + ParseContext context, boolean outputXml, String encoding) throws IOException { + this.parser = parser; + this.stream = stream; + this.metadata = metadata; + this.context = context; + this.outputXml = outputXml; + this.encoding = encoding; + + PipedInputStream pipedInputStream = new PipedInputStream(); + this.pipedOutputStream = new PipedOutputStream(pipedInputStream); + this.reader = new BufferedReader(new InputStreamReader(pipedInputStream)); + + Executor executor = command -> { + String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + if (name != null) { + name = "Apache Tika: " + name; + } else { + name = "Apache Tika"; + } + Thread thread = new Thread(command, name); + thread.setDaemon(true); + thread.start(); + }; + + executor.execute(new ParsingTask()); + + reader.mark(1); + reader.read(); + reader.reset(); + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + if (throwable instanceof ZeroByteFileException) { + return -1; + } else if (throwable instanceof IOException) { + throw (IOException) throwable; + } else if (throwable != null) { + throw new IOException("", throwable); + } + return reader.read(cbuf, off, len); + } + + @Override + public void close() throws IOException { + 
reader.close(); + } + + private class ParsingTask implements Runnable { + + public void run() { + try { + ContentHandler handler = outputXml ? new ToXMLContentHandler(pipedOutputStream, encoding) : new BodyContentHandler(pipedOutputStream); + parser.parse(stream, handler, metadata, context); + } catch (Throwable t) { + throwable = t; + } + + try { + stream.close(); + } catch (Throwable t) { + if (throwable == null) { + throwable = t; + } + } + + try { + pipedOutputStream.close(); + } catch (Throwable t) { + if (throwable == null) { + throwable = t; + } + } + } + + } +} diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java index f9f8e4a..699a47f 100644 --- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java +++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java @@ -11,17 +11,18 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.ParsingReader; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.graalvm.nativeimage.IsolateThread; import org.graalvm.nativeimage.c.function.CEntryPoint; import org.graalvm.nativeimage.c.type.CCharPointer; import org.graalvm.nativeimage.c.type.CConst; import org.graalvm.nativeimage.c.type.CTypeConversion; +import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import java.io.IOException; @@ -74,7 +75,9 @@ public static StringResult parseFileToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean 
asXML + // maybe replace with a single config class ) { try { final Path path = Paths.get(filePath); @@ -82,10 +85,9 @@ public static StringResult parseFileToString( final InputStream stream = TikaInputStream.get(path, metadata); String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); - } catch (java.io.IOException e) { return new StringResult((byte) 1, "Could not open file: " + e.getMessage()); } catch (TikaException e) { @@ -104,7 +106,8 @@ public static StringResult parseUrlToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { final URL url = new URI(urlString).toURL(); @@ -112,7 +115,7 @@ public static StringResult parseUrlToString( final TikaInputStream stream = TikaInputStream.get(url, metadata); String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); @@ -138,7 +141,8 @@ public static StringResult parseBytesToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { final Metadata metadata = new Metadata(); final ByteBufferInputStream inStream = new ByteBufferInputStream(data); @@ -146,7 +150,7 @@ public static StringResult parseBytesToString( try { String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, 
asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); } catch (java.io.IOException e) { @@ -156,16 +160,24 @@ public static StringResult parseBytesToString( } } - private static String parseToStringWithConfig( InputStream stream, Metadata metadata, int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) throws IOException, TikaException { - final WriteOutContentHandler handler = new WriteOutContentHandler(maxLength); + ContentHandler handler; + ContentHandler handlerForParser; + if (asXML) { + handler = new WriteOutContentHandler(new ToXMLContentHandler(), maxLength); + handlerForParser = handler; + } else { + handler = new WriteOutContentHandler(maxLength); + handlerForParser = new BodyContentHandler(handler); + } try { final TikaConfig config = TikaConfig.getDefaultConfig(); @@ -177,8 +189,7 @@ private static String parseToStringWithConfig( parsecontext.set(OfficeParserConfig.class, officeConfig); parsecontext.set(TesseractOCRConfig.class, tesseractConfig); - parser.parse(stream, new BodyContentHandler(handler), metadata, parsecontext); - + parser.parse(stream, handlerForParser, metadata, parsecontext); } catch (SAXException e) { if (!WriteLimitReachedException.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... 
@@ -203,7 +214,8 @@ public static ReaderResult parseFile( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { // System.out.println("pdfConfig.isExtractInlineImages = " + pdfConfig.isExtractInlineImages()); @@ -218,7 +230,7 @@ public static ReaderResult parseFile( final Metadata metadata = new Metadata(); final TikaInputStream stream = TikaInputStream.get(path, metadata); - return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig); + return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig, asXML); } catch (java.io.IOException e) { return new ReaderResult((byte) 1, "Could not open file: " + e.getMessage()); @@ -237,14 +249,15 @@ public static ReaderResult parseUrl( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { final URL url = new URI(urlString).toURL(); final Metadata metadata = new Metadata(); final TikaInputStream stream = TikaInputStream.get(url, metadata); - return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig); + return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig, asXML); } catch (MalformedURLException e) { return new ReaderResult((byte) 2, "Malformed URL error occurred " + e.getMessage()); @@ -267,7 +280,8 @@ public static ReaderResult parseBytes( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { @@ -275,7 +289,7 @@ public static ReaderResult parseBytes( final ByteBufferInputStream inStream = new ByteBufferInputStream(data); final TikaInputStream stream = TikaInputStream.get(inStream, new TemporaryResources(), metadata); - return parse(stream, metadata, 
charsetName, pdfConfig, officeConfig, tesseractConfig); + return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig, asXML); } private static ReaderResult parse( @@ -284,25 +298,28 @@ private static ReaderResult parse( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { final TikaConfig config = TikaConfig.getDefaultConfig(); final ParseContext parsecontext = new ParseContext(); final Parser parser = new AutoDetectParser(config); + final Charset charset = Charset.forName(charsetName, StandardCharsets.UTF_8); parsecontext.set(Parser.class, parser); parsecontext.set(PDFParserConfig.class, pdfConfig); parsecontext.set(OfficeParserConfig.class, officeConfig); parsecontext.set(TesseractOCRConfig.class, tesseractConfig); - final Reader reader = new ParsingReader(parser, inputStream, metadata, parsecontext); + //final Reader reader = new org.apache.tika.parser.ParsingReader(parser, inputStream, metadata, parsecontext); + final Reader reader = new ParsingReader(parser, inputStream, metadata, parsecontext, asXML, charset.name()); // Convert Reader which works with chars to ReaderInputStream which works with bytes ReaderInputStream readerInputStream = ReaderInputStream.builder() .setReader(reader) - .setCharset(Charset.forName(charsetName, StandardCharsets.UTF_8)) + .setCharset(charset) .get(); return new ReaderResult(readerInputStream, metadata); @@ -336,4 +353,4 @@ private static CCharPointer cParseToString(IsolateThread thread, @CConst CCharPo } } -} \ No newline at end of file +} diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json index bd46a15..a306d44 100644 --- 
a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json @@ -103,7 +103,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -113,7 +114,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -123,7 +125,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -133,7 +136,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -143,7 +147,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -153,7 +158,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ], diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json index 
8285d7d..2b0cc2c 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json @@ -36,19 +36,19 @@ "parameterTypes": [] }, { - "name": "getReader", + "name": "getMetadata", "parameterTypes": [] }, { - "name": "getStatus", + "name": "getReader", "parameterTypes": [] }, { - "name": "isError", + "name": "getStatus", "parameterTypes": [] }, { - "name": "getMetadata", + "name": "isError", "parameterTypes": [] } ], @@ -65,35 +65,20 @@ "parameterTypes": [] }, { - "name": "getStatus", + "name": "getMetadata", "parameterTypes": [] }, { - "name": "isError", + "name": "getStatus", "parameterTypes": [] }, { - "name": "getMetadata", + "name": "isError", "parameterTypes": [] } ], "type": "ai.yobix.StringResult" }, - { - "methods": [ - { - "name": "getValues", - "parameterTypes": [ - "java.lang.String" - ] - }, - { - "name": "names", - "parameterTypes": [] - } - ], - "type": "org.apache.tika.metadata.Metadata" - }, { "methods": [ { @@ -109,7 +94,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -119,7 +105,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -129,7 +116,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -139,7 +127,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", 
"org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -149,7 +138,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -159,7 +149,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ], @@ -741,6 +732,21 @@ ], "type": "org.apache.commons.io.input.ReaderInputStream" }, + { + "methods": [ + { + "name": "getValues", + "parameterTypes": [ + "java.lang.String" + ] + }, + { + "name": "names", + "parameterTypes": [] + } + ], + "type": "org.apache.tika.metadata.Metadata" + }, { "methods": [ { diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json index f703e05..61a8f2e 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json @@ -103,7 +103,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -113,7 +114,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { 
@@ -123,7 +125,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -133,7 +136,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -143,7 +147,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -153,7 +158,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ],