What is the fastest Python library for DOCX, XLSX, and PPTX?

Office Oxide is the fastest. DOCX text extraction averages 0.8ms (vs 11.8ms for python-docx — 14× faster). XLSX averages 5.0ms (vs 94.5ms for openpyxl — 18× faster). PPTX averages 0.7ms (vs 32.5ms for python-pptx — 46× faster). Benchmarked on 6,062 real-world files.

Is Office Oxide free for commercial use?

Yes. Office Oxide is dual-licensed MIT OR Apache-2.0 — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL or copyleft restrictions.

Does Office Oxide handle legacy .doc, .xls, and .ppt files?

Yes. Office Oxide reads all six formats: DOCX, XLSX, PPTX, plus legacy DOC, XLS, PPT. It is the only Rust or Python library that supports all three legacy formats without a JVM (Apache Tika) or external binaries (catdoc, antiword).

Can Office Oxide convert documents to Markdown?

Yes. Every supported format has built-in to_markdown() that preserves headings, tables, lists, and structure — ideal for LLM and RAG pipelines. No separate package needed.

How does Office Oxide compare to calamine and openpyxl for XLSX?

On 1,802 XLSX files: Office Oxide averages 5.0ms (97.8% pass rate). python-calamine averages 13.9ms (96.6%). openpyxl averages 94.5ms (96.2%). Office Oxide is 2.8× faster than calamine and 18× faster than openpyxl, with the highest pass rate.

Does Office Oxide work in the browser?

Yes. Office Oxide ships a WASM build (office-oxide-wasm on npm) that runs in any browser or bundler. Process Office documents client-side with no server round-trips — useful for privacy-sensitive workloads.

批量处理

Office Oxide的速度极快，绝大多数批量任务的瓶颈在于磁盘I/O而非解析。一个典型的Word文档提取只需0.8ms——这意味着单线程每秒可处理约1,000个文件。

本指南涵盖可扩展的各类模式：小型任务使用串行循环，中型任务使用线程池，从S3或HTTP流式读取时使用异步I/O。

串行循环 — 首选默认方案

对于本地磁盘上几千个文件以内的任务，简单的串行循环是最直接、往往也是最快的选择。可以避免启动工作线程的开销以及并行磁盘读取的竞争。

Rust

use std::path::Path;
use office_oxide::Document;
use walkdir::WalkDir;

for entry in WalkDir::new("corpus") {
    let entry = entry?;
    let path = entry.path();
    if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
        if matches!(ext.to_ascii_lowercase().as_str(),
                    "docx" | "xlsx" | "pptx" | "doc" | "xls" | "ppt") {
            let doc = Document::open(path)?;
            std::fs::write(path.with_extension("txt"), doc.plain_text())?;
        }
    }
}

Python

from pathlib import Path
from office_oxide import Document

for src in Path("corpus").rglob("*"):
    if src.suffix.lower() in {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}:
        with Document.open(src) as doc:
            text = doc.plain_text()
        src.with_suffix(".txt").write_text(text)

JavaScript

import { readdirSync, statSync, writeFileSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

function* walk(dir) {
  for (const name of readdirSync(dir)) {
    const full = join(dir, name);
    if (statSync(full).isDirectory()) yield* walk(full);
    else yield full;
  }
}

for (const src of walk('corpus')) {
  if (!exts.has(extname(src).toLowerCase())) continue;
  using doc = Document.open(src);
  writeFileSync(src.replace(/\.\w+$/, '.txt'), doc.plainText());
}

package main

import (
    "os"
    "path/filepath"
    "strings"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

var exts = map[string]bool{
    ".docx": true, ".xlsx": true, ".pptx": true,
    ".doc": true, ".xls": true, ".ppt": true,
}

func main() {
    filepath.Walk("corpus", func(path string, info os.FileInfo, err error) error {
        if err != nil || info.IsDir() { return err }
        if !exts[strings.ToLower(filepath.Ext(path))] { return nil }
        doc, err := officeoxide.Open(path)
        if err != nil { return nil } // skip unreadable
        defer doc.Close()
        text, _ := doc.PlainText()
        return os.WriteFile(strings.TrimSuffix(path, filepath.Ext(path))+".txt", []byte(text), 0644)
    })
}

using OfficeOxide;

var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt" };

foreach (var src in Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories))
{
    if (!exts.Contains(Path.GetExtension(src).ToLowerInvariant())) continue;
    using var doc = Document.Open(src);
    File.WriteAllText(Path.ChangeExtension(src, ".txt"), doc.PlainText());
}

#include <stdio.h>
#include "office_oxide.h"

/* Walk a directory however you like (dirent/nftw); for each candidate path: */
static int extract_one(const char *path, const char *out_path) {
    int err = 0;
    OfficeDocumentHandle *doc = office_document_open(path, &err);
    if (!doc) return err;                 /* skip unreadable */
    char *text = office_document_plain_text(doc, &err);
    if (text) {
        FILE *f = fopen(out_path, "w");
        if (f) { fputs(text, f); fclose(f); }
        office_oxide_free_string(text);
    }
    office_document_free(doc);
    return 0;
}

WASM

import { WasmDocument } from 'office-oxide-wasm';

// Browser, user-uploaded files — runs entirely client-side, no server round-trip.
// <input type="file" multiple accept=".docx,.xlsx,.pptx,.doc,.xls,.ppt">
async function extractAll(fileList) {
  const results = [];
  for (const file of fileList) {
    const data = new Uint8Array(await file.arrayBuffer());
    const fmt = file.name.split('.').pop().toLowerCase();
    const doc = new WasmDocument(data, fmt);
    try {
      results.push({ name: file.name, text: doc.plainText() });
    } finally {
      doc.free();
    }
  }
  return results;
}

WASM完全在客户端运行——文件不会离开浏览器，非常适合隐私敏感的批量任务。

并行处理 — 大规模语料库

当文件数量达到数万个且拥有高速SSD时，并行处理能发挥优势。但要注意：工作线程过多会使磁盘饱和，反而降低吞吐量。

Rust (rayon)

use rayon::prelude::*;
use office_oxide::Document;

paths.par_iter().for_each(|path| {
    if let Ok(doc) = Document::open(path) {
        let _ = std::fs::write(path.with_extension("md"), doc.to_markdown());
    }
});

Rayon的默认线程数与CPU核心数匹配，几乎总是最佳设置。

Python (ProcessPoolExecutor)

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from office_oxide import Document

def process(path: Path) -> None:
    with Document.open(path) as doc:
        path.with_suffix(".md").write_text(doc.to_markdown())

paths = [p for p in Path("corpus").rglob("*")
         if p.suffix.lower() in {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}]

with ProcessPoolExecutor(max_workers=8) as ex:
    for _ in ex.map(process, paths):
        pass

Python绑定在原生解析期间会释放GIL，因此ThreadPoolExecutor也能工作——但如果单个文档崩溃，进程方式的隔离性更好。

Go (goroutine pool)

package main

import (
    "os"
    "path/filepath"
    "runtime"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

func main() {
    var paths []string
    filepath.Walk("corpus", func(p string, info os.FileInfo, err error) error {
        if err != nil || info.IsDir() { return err }
        ext := strings.ToLower(filepath.Ext(p))
        if ext == ".docx" || ext == ".xlsx" || ext == ".pptx" { paths = append(paths, p) }
        return nil
    })

    jobs := make(chan string)
    var wg sync.WaitGroup
    for i := 0; i < runtime.NumCPU(); i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for path := range jobs {
                doc, err := officeoxide.Open(path)
                if err != nil { continue }
                md, _ := doc.ToMarkdown()
                os.WriteFile(strings.TrimSuffix(path, filepath.Ext(path))+".md", []byte(md), 0644)
                doc.Close()
            }
        }()
    }
    for _, p := range paths { jobs <- p }
    close(jobs)
    wg.Wait()
}

JavaScript (Promise.all)

import { readdirSync, statSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

function* walk(dir) {
  for (const name of readdirSync(dir)) {
    const full = join(dir, name);
    if (statSync(full).isDirectory()) yield* walk(full);
    else yield full;
  }
}

const paths = [...walk('corpus')].filter(p => exts.has(extname(p).toLowerCase()));

// Limit concurrency
const CONCURRENCY = 8;
let i = 0;
async function worker() {
  while (i < paths.length) {
    const path = paths[i++];
    using doc = Document.open(path);
    // process doc.toMarkdown() ...
  }
}
await Promise.all(Array.from({ length: CONCURRENCY }, worker));

C# (Parallel.ForEach)

using OfficeOxide;

var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx" };
var paths = Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories)
    .Where(p => exts.Contains(Path.GetExtension(p).ToLowerInvariant()))
    .ToList();

Parallel.ForEach(paths, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount }, path =>
{
    try
    {
        using var doc = Document.Open(path);
        File.WriteAllText(Path.ChangeExtension(path, ".md"), doc.ToMarkdown());
    }
    catch (OfficeOxideException) { /* skip unreadable */ }
});

C (pthreads work item)

#include <stdio.h>
#include "office_oxide.h"

/* Each worker thread pulls a path and converts it; the C ABI is thread-safe
   across independent handles, so one OfficeDocumentHandle per thread is fine. */
static void convert_to_markdown(const char *path, const char *out_path) {
    int err = 0;
    OfficeDocumentHandle *doc = office_document_open(path, &err);
    if (!doc) return;                     /* skip unreadable */
    char *md = office_document_to_markdown(doc, &err);
    if (md) {
        FILE *f = fopen(out_path, "w");
        if (f) { fputs(md, f); fclose(f); }
        office_oxide_free_string(md);
    }
    office_document_free(doc);
}

异步处理 — 文件来自外部时

如果输入来自HTTP、S3或消息队列，异步I/O更具优势，因为网络延迟远超解析时间。请从字节流而非路径打开文档。

Rust (tokio)

use office_oxide::{Document, DocumentFormat};
use std::io::Cursor;

let bytes = reqwest::get(url).await?.bytes().await?;
let fmt = DocumentFormat::Docx;
// Move parsing to a blocking task — extraction is CPU-bound.
let text = tokio::task::spawn_blocking(move || -> office_oxide::Result<String> {
    let doc = Document::from_reader(Cursor::new(bytes.to_vec()), fmt)?;
    Ok(doc.plain_text())
}).await??;

Python (asyncio + aiohttp)

import asyncio, aiohttp
from office_oxide import Document

async def fetch_and_extract(session, url):
    async with session.get(url) as r:
        data = await r.read()
    fmt = url.rsplit(".", 1)[-1].lower()
    with Document.from_bytes(data, fmt) as doc:
        return doc.plain_text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_and_extract(session, u) for u in urls))

JavaScript (fetch + concurrency limit)

import { Document } from 'office-oxide';

async function fetchAndExtract(url) {
  const res = await fetch(url);
  const buf = Buffer.from(await res.arrayBuffer());
  const fmt = url.split('.').pop().toLowerCase();
  using doc = Document.fromBytes(buf, fmt);
  return doc.plainText();
}

const CONCURRENCY = 16;
const queue = [...urls];
const results = [];
await Promise.all(Array.from({ length: CONCURRENCY }, async () => {
  while (queue.length) {
    const url = queue.shift();
    results.push(await fetchAndExtract(url));
  }
}));

Go (HTTP fan-out)

package main

import (
    "io"
    "net/http"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

func fetchAndExtract(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil { return "", err }
    defer resp.Body.Close()
    data, err := io.ReadAll(resp.Body)
    if err != nil { return "", err }

    fmt := url[strings.LastIndex(url, ".")+1:]
    doc, err := officeoxide.OpenFromBytes(data, fmt)
    if err != nil { return "", err }
    defer doc.Close()
    return doc.PlainText()
}

func main() {
    urls := []string{ /* ... */ }
    sem := make(chan struct{}, 16) // concurrency cap
    var wg sync.WaitGroup
    for _, u := range urls {
        wg.Add(1)
        sem <- struct{}{}
        go func(url string) {
            defer wg.Done()
            defer func() { <-sem }()
            text, _ := fetchAndExtract(url)
            _ = text // process...
        }(u)
    }
    wg.Wait()
}

C# (HttpClient + async)

using OfficeOxide;

using var http = new HttpClient();

async Task<string> FetchAndExtract(string url)
{
    var data = await http.GetByteArrayAsync(url);
    var fmt = url[(url.LastIndexOf('.') + 1)..].ToLowerInvariant();
    using var doc = Document.FromBytes(data, fmt);
    return doc.PlainText();
}

// Limit concurrency with SemaphoreSlim
var sem = new SemaphoreSlim(16);
var tasks = urls.Select(async url =>
{
    await sem.WaitAsync();
    try { return await FetchAndExtract(url); }
    finally { sem.Release(); }
});
var results = await Task.WhenAll(tasks);

C (open from bytes)

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include "office_oxide.h"

/* After fetching `data`/`len` over HTTP, parse without touching the filesystem. */
char *extract_from_bytes(const uint8_t *data, size_t len, const char *fmt) {
    int err = 0;
    OfficeDocumentHandle *doc =
        office_document_open_from_bytes(data, len, fmt, &err);   /* fmt: "docx" etc. */
    if (!doc) return NULL;
    char *text = office_document_plain_text(doc, &err);          /* free with free_string */
    office_document_free(doc);
    return text;
}

WASM (open from bytes)

import { WasmDocument } from 'office-oxide-wasm';

// Edge / browser: fetch bytes and extract with no server round-trip.
async function fetchAndExtract(url) {
  const data = new Uint8Array(await (await fetch(url)).arrayBuffer());
  const fmt = url.split('.').pop().toLowerCase();
  const doc = new WasmDocument(data, fmt);
  try {
    return doc.plainText();
  } finally {
    doc.free();
  }
}

内存使用建议

处理超大XLSX文件时，在Rust中启用mmap特性（features = ["mmap"]）并调用Document::open_mmap，可避免将整个压缩包复制到堆内存。
每个工作线程同一时间只保持一个Document打开。每个句柄在内存中持有已解析的结构；关闭它（Rust中的drop、Python中退出with块、JS中的close()/using）即可释放。
用于大规模LLM摄取时，优先使用to_markdown()而非to_html()——Markdown输出更小，可提升下游LLM的吞吐量。

另请参阅

性能基准测试 — 包含p99的完整数据
Office用于RAG — RAG专用模式