What is the fastest Python library for DOCX, XLSX, and PPTX?

Office Oxide is the fastest. DOCX text extraction averages 0.8ms (vs 11.8ms for python-docx — 14× faster). XLSX averages 5.0ms (vs 94.5ms for openpyxl — 18× faster). PPTX averages 0.7ms (vs 32.5ms for python-pptx — 46× faster). Benchmarked on 6,062 real-world files.

Is Office Oxide free for commercial use?

Yes. Office Oxide is dual-licensed MIT OR Apache-2.0 — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL or copyleft restrictions.

Does Office Oxide handle legacy .doc, .xls, and .ppt files?

Yes. Office Oxide reads all six formats: DOCX, XLSX, PPTX, plus legacy DOC, XLS, PPT. It is the only Rust or Python library that supports all three legacy formats without a JVM (Apache Tika) or external binaries (catdoc, antiword).

Can Office Oxide convert documents to Markdown?

Yes. Every supported format has built-in to_markdown() that preserves headings, tables, lists, and structure — ideal for LLM and RAG pipelines. No separate package needed.

How does Office Oxide compare to calamine and openpyxl for XLSX?

On 1,802 XLSX files: Office Oxide averages 5.0ms (97.8% pass rate). python-calamine averages 13.9ms (96.6%). openpyxl averages 94.5ms (96.2%). Office Oxide is 2.8× faster than calamine and 18× faster than openpyxl, with the highest pass rate.

Does Office Oxide work in the browser?

Yes. Office Oxide ships a WASM build (office-oxide-wasm on npm) that runs in any browser or bundler. Process Office documents client-side with no server round-trips — useful for privacy-sensitive workloads.

Batch-Verarbeitung

Office Oxide ist so schnell, dass bei den meisten Batch-Jobs das Disk-I/O der Flaschenhals ist – nicht das Parsing. Ein typisches Word-Dokument wird in 0,8 ms extrahiert – ein einzelner Thread verarbeitet damit rund 1.000 Dateien pro Sekunde.

Dieser Leitfaden beschreibt skalierbare Muster: serielle Schleifen für kleine Jobs, Thread-Pools für mittlere und asynchrones I/O beim Streaming aus S3 oder HTTP.

Serielle Schleife – der sinnvolle Standard

Für bis zu einige Tausend Dateien auf lokaler Festplatte ist eine einfache serielle Schleife die unkomplizierteste und oft schnellste Wahl. Sie vermeiden den Overhead beim Starten von Workern und die Konkurrenz bei parallelen Lesezugriffen.

Rust

use std::path::Path;
use office_oxide::Document;
use walkdir::WalkDir;

// Walk the corpus tree and write each supported document's plain text to a
// sibling .txt file. Any walk, open or write error aborts via `?`.
for entry in WalkDir::new("corpus") {
    let entry = entry?;
    let path = entry.path();
    // Entries without a UTF-8 extension cannot match a supported format.
    let ext = match path.extension().and_then(|e| e.to_str()) {
        Some(raw) => raw.to_ascii_lowercase(),
        None => continue,
    };
    let supported = matches!(
        ext.as_str(),
        "docx" | "xlsx" | "pptx" | "doc" | "xls" | "ppt"
    );
    if !supported {
        continue;
    }
    let doc = Document::open(path)?;
    let target = path.with_extension("txt");
    std::fs::write(target, doc.plain_text())?;
}

Python

from pathlib import Path
from office_oxide import Document

# Extensions this example converts.
SUPPORTED = {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}

for src in Path("corpus").rglob("*"):
    if src.suffix.lower() not in SUPPORTED:
        continue
    # The document is closed before the output file is written.
    with Document.open(src) as doc:
        text = doc.plain_text()
    target = src.with_suffix(".txt")
    target.write_text(text)

JavaScript

import { readdirSync, statSync, writeFileSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

// Lower-cased extensions this example converts.
const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

/**
 * Recursively yield the path of every non-directory entry below `dir`.
 * Directories are descended into depth-first, in readdirSync order.
 */
function* walk(dir) {
  const names = readdirSync(dir);
  for (let k = 0; k < names.length; k++) {
    const full = join(dir, names[k]);
    const isDir = statSync(full).isDirectory();
    if (isDir) {
      yield* walk(full);
    } else {
      yield full;
    }
  }
}

for (const src of walk('corpus')) {
  if (!exts.has(extname(src).toLowerCase())) continue;
  using doc = Document.open(src);
  writeFileSync(src.replace(/\.\w+$/, '.txt'), doc.plainText());
}

Go

package main

import (
    "os"
    "path/filepath"
    "strings"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

// exts is the set of lower-cased file extensions that main converts.
var exts = map[string]bool{
    ".docx": true,
    ".xlsx": true,
    ".pptx": true,
    ".doc":  true,
    ".xls":  true,
    ".ppt":  true,
}

// main walks the "corpus" directory tree and writes the plain text of every
// file whose extension is in exts to a sibling ".txt" file.
//
// Documents that cannot be opened or whose text cannot be extracted are
// skipped. A directory-walk error or a failed write stops the walk; that
// error is printed to stderr and the process exits with status 1 instead of
// being silently discarded.
func main() {
    err := filepath.Walk("corpus", func(path string, info os.FileInfo, err error) error {
        if err != nil {
            return err
        }
        if info.IsDir() || !exts[strings.ToLower(filepath.Ext(path))] {
            return nil
        }
        doc, err := officeoxide.Open(path)
        if err != nil {
            return nil // skip unreadable
        }
        // The callback runs once per file, so this defer fires per document.
        defer doc.Close()
        text, err := doc.PlainText()
        if err != nil {
            return nil // skip rather than write an empty .txt file
        }
        out := strings.TrimSuffix(path, filepath.Ext(path)) + ".txt"
        return os.WriteFile(out, []byte(text), 0644)
    })
    if err != nil {
        os.Stderr.WriteString("walk corpus: " + err.Error() + "\n")
        os.Exit(1)
    }
}

C#

using OfficeOxide;

// Lower-cased extensions this example converts.
var supported = new HashSet<string> { ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt" };

// Write each supported document's plain text to a sibling .txt file.
foreach (var file in Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories))
{
    var extension = Path.GetExtension(file).ToLowerInvariant();
    if (supported.Contains(extension))
    {
        // Disposed when this block is left, i.e. once per file.
        using var document = Document.Open(file);
        var target = Path.ChangeExtension(file, ".txt");
        File.WriteAllText(target, document.PlainText());
    }
}

C

#include <stdio.h>
#include "office_oxide.h"

/* Walk a directory however you like (dirent/nftw); for each candidate path: */
/*
 * Extract the plain text of the document at `path` and write it to `out_path`.
 *
 * Returns 0 on success. If the document cannot be opened, or its text cannot
 * be extracted, returns the code that office_oxide stored in its error
 * out-parameter (previously an extraction failure was reported as success).
 *
 * NOTE(review): a failure to open `out_path` is still not reported to the
 * caller; decide on an error code for that case if callers need it.
 */
static int extract_one(const char *path, const char *out_path) {
    int err = 0;
    int rc = 0;
    OfficeDocumentHandle *doc = office_document_open(path, &err);
    if (!doc) return err;                 /* skip unreadable */
    char *text = office_document_plain_text(doc, &err);
    if (text) {
        FILE *f = fopen(out_path, "w");
        if (f) { fputs(text, f); fclose(f); }
        office_oxide_free_string(text);
    } else {
        rc = err;                         /* propagate the extraction failure */
    }
    office_document_free(doc);
    return rc;
}

WASM

import { WasmDocument } from 'office-oxide-wasm';

// Browser, user-uploaded files — runs entirely client-side, no server round-trip.
// <input type="file" multiple accept=".docx,.xlsx,.pptx,.doc,.xls,.ppt">
/**
 * Extract plain text from each file in `fileList`, one after another.
 * The format is the lower-cased text after the last "." of the file name.
 * Resolves to an array of `{ name, text }` objects in input order.
 */
async function extractAll(fileList) {
  const extracted = [];
  for (const file of fileList) {
    const buffer = await file.arrayBuffer();
    const bytes = new Uint8Array(buffer);
    const parts = file.name.split('.');
    const fmt = parts[parts.length - 1].toLowerCase();
    const doc = new WasmDocument(bytes, fmt);
    try {
      const text = doc.plainText();
      extracted.push({ name: file.name, text });
    } finally {
      // Free the document even if extraction throws.
      doc.free();
    }
  }
  return extracted;
}

WASM läuft vollständig clientseitig – Dateien verlassen den Browser nie, ideal für datenschutzsensible Batch-Aufgaben.

Parallel – für große Korpora

Wenn Sie Zehntausende Dateien und eine schnelle SSD haben, bringt Parallelität einen echten Gewinn. Achtung: Zu viele Worker sättigen die Festplatte und senken den Durchsatz.

Rust (rayon)

use rayon::prelude::*;
use office_oxide::Document;

// Convert each path to Markdown in parallel. Documents that fail to open are
// skipped, and write errors are deliberately ignored.
paths.par_iter().for_each(|path| {
    match Document::open(path) {
        Ok(doc) => {
            let target = path.with_extension("md");
            let _ = std::fs::write(target, doc.to_markdown());
        }
        Err(_) => {}
    }
});

Die Standard-Thread-Anzahl von Rayon entspricht der CPU-Kernanzahl – das ist fast immer die richtige Einstellung.

Python (ProcessPoolExecutor)

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from office_oxide import Document

def process(path: Path) -> None:
    """Write the Markdown rendering of ``path`` to a sibling ``.md`` file."""
    with Document.open(path) as doc:
        markdown = doc.to_markdown()
        target = path.with_suffix(".md")
        target.write_text(markdown)

# The __main__ guard is required for ProcessPoolExecutor: with the "spawn"
# start method (the default on Windows and macOS) every worker re-imports this
# module, and an unguarded pool would be created again inside each worker.
if __name__ == "__main__":
    paths = [p for p in Path("corpus").rglob("*")
             if p.suffix.lower() in {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}]

    with ProcessPoolExecutor(max_workers=8) as ex:
        # Drain the iterator so that an exception raised in a worker surfaces here.
        for _ in ex.map(process, paths):
            pass

Das Python-Binding gibt den GIL während des nativen Parsings frei, sodass auch ein ThreadPoolExecutor funktioniert – Prozesse bieten jedoch bessere Isolation, falls ein einzelnes Dokument einen Panic auslöst.

Go (goroutine pool)

package main

import (
    "os"
    "path/filepath"
    "runtime"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

// main converts every .docx, .xlsx and .pptx file under "corpus" to a sibling
// ".md" file, using one worker goroutine per CPU.
//
// A directory-walk error is reported on stderr; the files collected up to
// that point are still converted. Failed writes are reported on stderr.
func main() {
    var paths []string
    walkErr := filepath.Walk("corpus", func(p string, info os.FileInfo, err error) error {
        if err != nil {
            return err
        }
        if info.IsDir() {
            return nil
        }
        switch strings.ToLower(filepath.Ext(p)) {
        case ".docx", ".xlsx", ".pptx":
            paths = append(paths, p)
        }
        return nil
    })
    if walkErr != nil {
        os.Stderr.WriteString("walk corpus: " + walkErr.Error() + "\n")
    }

    jobs := make(chan string)
    var wg sync.WaitGroup
    for i := 0; i < runtime.NumCPU(); i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for path := range jobs {
                if err := convertToMarkdown(path); err != nil {
                    os.Stderr.WriteString(path + ": " + err.Error() + "\n")
                }
            }
        }()
    }
    for _, p := range paths {
        jobs <- p
    }
    close(jobs)
    wg.Wait()
}

// convertToMarkdown writes the Markdown rendering of the document at path to
// a sibling ".md" file. Documents that cannot be opened or rendered are
// skipped (nil is returned) so that no empty output file is produced; only a
// failed write is returned as an error.
func convertToMarkdown(path string) error {
    doc, err := officeoxide.Open(path)
    if err != nil {
        return nil // skip unreadable
    }
    // A helper function lets the handle be closed per document via defer.
    defer doc.Close()
    md, err := doc.ToMarkdown()
    if err != nil {
        return nil // skip rather than write an empty .md file
    }
    out := strings.TrimSuffix(path, filepath.Ext(path)) + ".md"
    return os.WriteFile(out, []byte(md), 0644)
}

JavaScript (Promise.all)

import { readdirSync, statSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

// Lower-cased extensions this example converts.
const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

/**
 * Recursively yield the path of every non-directory entry below `dir`.
 * Directories are descended into depth-first, in readdirSync order.
 */
function* walk(dir) {
  const names = readdirSync(dir);
  for (let k = 0; k < names.length; k++) {
    const full = join(dir, names[k]);
    const isDir = statSync(full).isDirectory();
    if (isDir) {
      yield* walk(full);
    } else {
      yield full;
    }
  }
}

const paths = [...walk('corpus')].filter(p => exts.has(extname(p).toLowerCase()));

// Limit concurrency
const CONCURRENCY = 8;
let i = 0;
async function worker() {
  while (i < paths.length) {
    const path = paths[i++];
    using doc = Document.open(path);
    // process doc.toMarkdown() ...
  }
}
await Promise.all(Array.from({ length: CONCURRENCY }, worker));

C# (Parallel.ForEach)

using OfficeOxide;

// Lower-cased extensions this example converts.
var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx" };

// Materialise the list of candidate files before starting the parallel loop.
var paths = (from p in Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories)
             where exts.Contains(Path.GetExtension(p).ToLowerInvariant())
             select p).ToList();

var options = new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount };

// Convert each file to a sibling .md file, at most one worker per processor.
Parallel.ForEach(paths, options, path =>
{
    try
    {
        using var doc = Document.Open(path);
        var target = Path.ChangeExtension(path, ".md");
        File.WriteAllText(target, doc.ToMarkdown());
    }
    catch (OfficeOxideException)
    {
        /* skip unreadable */
    }
});

C (pthreads work item)

#include <stdio.h>
#include "office_oxide.h"

/* Each worker thread pulls a path and converts it; the C ABI is thread-safe
   across independent handles, so one OfficeDocumentHandle per thread is fine. */
/*
 * Convert the document at `path` to Markdown and write it to `out_path`.
 * Failures to open the document, render it, or open the output file are
 * silently skipped; the function reports nothing to its caller.
 */
static void convert_to_markdown(const char *path, const char *out_path) {
    int status = 0;
    OfficeDocumentHandle *handle = office_document_open(path, &status);
    if (handle == NULL) {
        return;                           /* skip unreadable */
    }

    char *markdown = office_document_to_markdown(handle, &status);
    if (markdown != NULL) {
        FILE *out = fopen(out_path, "w");
        if (out != NULL) {
            fputs(markdown, out);
            fclose(out);
        }
        office_oxide_free_string(markdown);
    }

    office_document_free(handle);
}

Asynchron – wenn Dateien von außen kommen

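Kommen Ihre Eingaben per HTTP, S3 oder aus einer Queue, lohnt sich asynchrones I/O, weil die Netzwerklatenz die Parsing-Zeit bei Weitem dominiert. Öffnen Sie die Dokumente in diesem Fall direkt aus den empfangenen Bytes, statt sie erst auf die Festplatte zu schreiben und über einen Pfad zu öffnen.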

Rust (tokio)

use office_oxide::{Document, DocumentFormat};
use std::io::Cursor;

// Download the document body.
let response = reqwest::get(url).await?;
let bytes = response.bytes().await?;
let fmt = DocumentFormat::Docx;
// Move parsing to a blocking task — extraction is CPU-bound.
let parse_task = tokio::task::spawn_blocking(move || -> office_oxide::Result<String> {
    let reader = Cursor::new(bytes.to_vec());
    let doc = Document::from_reader(reader, fmt)?;
    Ok(doc.plain_text())
});
// The first `?` handles the join error, the second the parse error.
let text = parse_task.await??;

Python (asyncio + aiohttp)

import asyncio, aiohttp
from office_oxide import Document

async def fetch_and_extract(session, url):
    """Download ``url`` with ``session`` and return the document's plain text.

    The format passed to the parser is the lower-cased text after the last
    "." in ``url``.
    """
    async with session.get(url) as response:
        payload = await response.read()
    extension = url.rpartition(".")[2].lower()
    with Document.from_bytes(payload, extension) as doc:
        return doc.plain_text()

async def main(urls):
    """Fetch and extract all ``urls`` concurrently over one shared session.

    Returns the extracted texts in the same order as ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        pending = [fetch_and_extract(session, u) for u in urls]
        return await asyncio.gather(*pending)

JavaScript (fetch + concurrency limit)

import { Document } from 'office-oxide';

async function fetchAndExtract(url) {
  const res = await fetch(url);
  const buf = Buffer.from(await res.arrayBuffer());
  const fmt = url.split('.').pop().toLowerCase();
  using doc = Document.fromBytes(buf, fmt);
  return doc.plainText();
}

const CONCURRENCY = 16;
const queue = [...urls];
const results = [];

// Each runner takes URLs off the shared queue until it is empty. Results are
// appended in completion order, not input order.
async function drain() {
  while (queue.length > 0) {
    const url = queue.shift();
    const text = await fetchAndExtract(url);
    results.push(text);
  }
}

await Promise.all(Array.from({ length: CONCURRENCY }, () => drain()));

Go (HTTP fan-out)

package main

import (
    "io"
    "net/http"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

// fetchAndExtract downloads the document at url and returns its plain text.
//
// The format passed to the parser is the text after the last "." in url,
// lower-cased to match the other language examples. It returns the first
// error from the HTTP request, the body read, the parse, or the extraction.
func fetchAndExtract(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()
    data, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }

    // Named format, not fmt, to avoid shadowing the standard package name.
    // NOTE(review): presumably the binding expects lower-case names such as
    // "docx" — confirm whether it is case-sensitive.
    format := strings.ToLower(url[strings.LastIndex(url, ".")+1:])
    doc, err := officeoxide.OpenFromBytes(data, format)
    if err != nil {
        return "", err
    }
    defer doc.Close()
    return doc.PlainText()
}

// main fetches and extracts every URL concurrently, with at most 16 requests
// in flight, and waits for all of them to finish. Extraction errors are
// discarded.
func main() {
    urls := []string{ /* ... */ }

    const limit = 16
    slots := make(chan struct{}, limit) // concurrency cap
    var pending sync.WaitGroup

    for _, u := range urls {
        pending.Add(1)
        // Blocks here once `limit` goroutines hold a slot.
        slots <- struct{}{}
        go func(target string) {
            defer func() {
                <-slots
                pending.Done()
            }()
            text, _ := fetchAndExtract(target)
            _ = text // process...
        }(u)
    }
    pending.Wait()
}

C# (HttpClient + async)

using OfficeOxide;

// One HttpClient shared by all requests.
using var http = new HttpClient();

// Downloads `url` and returns the plain text of the document it points to.
// The format is the lower-cased text after the last '.' in the URL.
async Task<string> FetchAndExtract(string url)
{
    byte[] payload = await http.GetByteArrayAsync(url);
    int dot = url.LastIndexOf('.');
    string fmt = url.Substring(dot + 1).ToLowerInvariant();
    using var doc = Document.FromBytes(payload, fmt);
    return doc.PlainText();
}

// Limit concurrency with SemaphoreSlim
var sem = new SemaphoreSlim(16);

// Waits for a free slot, runs the extraction, and always releases the slot.
async Task<string> Throttled(string url)
{
    await sem.WaitAsync();
    try
    {
        return await FetchAndExtract(url);
    }
    finally
    {
        sem.Release();
    }
}

var tasks = urls.Select(url => Throttled(url));
var results = await Task.WhenAll(tasks);

C (open from bytes)

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include "office_oxide.h"

/* After fetching `data`/`len` over HTTP, parse without touching the filesystem. */
/*
 * Parse `len` bytes at `data` as a document of format `fmt` ("docx" etc.) and
 * return its plain text, or NULL if the document cannot be opened. The
 * document handle is released before returning; the caller frees the returned
 * string with office_oxide_free_string.
 */
char *extract_from_bytes(const uint8_t *data, size_t len, const char *fmt) {
    int status = 0;
    OfficeDocumentHandle *handle =
        office_document_open_from_bytes(data, len, fmt, &status);
    if (handle == NULL) {
        return NULL;
    }

    char *result = office_document_plain_text(handle, &status);
    office_document_free(handle);
    return result;
}

WASM (open from bytes)

import { WasmDocument } from 'office-oxide-wasm';

// Edge / browser: fetch bytes and extract with no server round-trip.
/**
 * Fetch `url` and return the plain text of the document it points to.
 * The format is the lower-cased text after the last "." in the URL.
 */
async function fetchAndExtract(url) {
  const response = await fetch(url);
  const bytes = new Uint8Array(await response.arrayBuffer());
  const segments = url.split('.');
  const fmt = segments[segments.length - 1].toLowerCase();
  const doc = new WasmDocument(bytes, fmt);
  try {
    const text = doc.plainText();
    return text;
  } finally {
    // Free the document whether or not extraction succeeded.
    doc.free();
  }
}

Speicher-Tipps

Für sehr große XLSX-Dateien empfiehlt es sich, in Rust das mmap-Feature zu aktivieren (features = ["mmap"]) und Document::open_mmap aufzurufen, damit nicht das gesamte Archiv in den Heap kopiert werden muss.
Pro Worker sollte immer nur ein Document gleichzeitig geöffnet sein. Jedes Handle hält die geparste Struktur im Speicher; erst das Schließen (Drop in Rust, Verlassen des with-Blocks in Python, close()/using in JS, Close() in Go, using in C#, office_document_free in C, free() in WASM) gibt den Speicher frei.
Für LLM-Ingest in großem Maßstab ist to_markdown() gegenüber to_html() zu bevorzugen – Markdown erzeugt kleinere Ausgaben und ermöglicht so einen höheren Durchsatz in nachgelagerten LLM-Systemen.

Weitere Ressourcen

Performance-Benchmarks — vollständige Zahlen inklusive p99
Office für RAG — RAG-spezifische Muster