What is the fastest Python library for DOCX, XLSX, and PPTX?

Office Oxide is the fastest. DOCX text extraction averages 0.8ms (vs 11.8ms for python-docx — 14× faster). XLSX averages 5.0ms (vs 94.5ms for openpyxl — 18× faster). PPTX averages 0.7ms (vs 32.5ms for python-pptx — 46× faster). Benchmarked on 6,062 real-world files.

Is Office Oxide free for commercial use?

Yes. Office Oxide is dual-licensed MIT OR Apache-2.0 — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL or copyleft restrictions.

Does Office Oxide handle legacy .doc, .xls, and .ppt files?

Yes. Office Oxide reads all six formats: DOCX, XLSX, PPTX, plus legacy DOC, XLS, PPT. It is the only Rust or Python library that supports all three legacy formats without a JVM (Apache Tika) or external binaries (catdoc, antiword).

Can Office Oxide convert documents to Markdown?

Yes. Every supported format has built-in to_markdown() that preserves headings, tables, lists, and structure — ideal for LLM and RAG pipelines. No separate package needed.

How does Office Oxide compare to calamine and openpyxl for XLSX?

On 1,802 XLSX files: Office Oxide averages 5.0ms (97.8% pass rate). python-calamine averages 13.9ms (96.6%). openpyxl averages 94.5ms (96.2%). Office Oxide is 2.8× faster than calamine and 18× faster than openpyxl, with the highest pass rate.

Does Office Oxide work in the browser?

Yes. Office Oxide ships a WASM build (office-oxide-wasm on npm) that runs in any browser or bundler. Process Office documents client-side with no server round-trips — useful for privacy-sensitive workloads.

バッチ処理

Office Oxideはほとんどのバッチ処理においてボトルネックがパースではなくディスクI/Oになるほど高速です。一般的なWordドキュメントの抽出時間は0.8ms——つまりシングルスレッドで毎秒約1,000ファイルを処理できます。

このガイドでは、規模に応じた各種パターンを紹介します：小〜中規模ジョブ向けのシリアルループ、大規模コーパス向けの並列処理（スレッド／プロセスプール）、S3やHTTPからストリームする場合の非同期I/O。

シリアルループ — まず検討すべき選択肢

ローカルディスク上の数千ファイル程度なら、シンプルなシリアルループが最もわかりやすく、多くの場合で最速の選択肢です。ワーカーの起動コストや並列ディスク読み込みの競合を回避できます。

Rust

use std::path::Path;
use office_oxide::Document;
use walkdir::WalkDir;

// Recursively visit everything under `corpus` and extract supported Office files.
for entry in WalkDir::new("corpus") {
    let entry = entry?;
    let path = entry.path();

    // Match the extension case-insensitively; entries without a UTF-8
    // extension are skipped.
    let ext = match path.extension().and_then(|e| e.to_str()) {
        Some(ext) => ext.to_ascii_lowercase(),
        None => continue,
    };
    let supported = matches!(
        ext.as_str(),
        "docx" | "xlsx" | "pptx" | "doc" | "xls" | "ppt"
    );
    if !supported {
        continue;
    }

    // Write the plain text next to the source, swapping the extension for `.txt`.
    let doc = Document::open(path)?;
    std::fs::write(path.with_extension("txt"), doc.plain_text())?;
}

Python

from pathlib import Path
from office_oxide import Document

# Extract plain text from every supported document under "corpus" and write it
# next to the source file with a ".txt" suffix.
for src in Path("corpus").rglob("*"):
    if src.suffix.lower() in {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}:
        with Document.open(src) as doc:
            text = doc.plain_text()
        # Pin UTF-8: the platform default encoding (e.g. cp1252 on Windows)
        # cannot represent all document text and would raise or corrupt output.
        src.with_suffix(".txt").write_text(text, encoding="utf-8")

JavaScript

import { readdirSync, statSync, writeFileSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

/**
 * Lazily yields the path of every non-directory entry beneath `dir`,
 * descending into subdirectories depth-first in `readdirSync` order.
 *
 * @param {string} dir - Directory to traverse.
 * @yields {string} Path of each non-directory entry, prefixed with `dir`.
 */
function* walk(dir) {
  const entries = readdirSync(dir);
  for (const entry of entries) {
    const entryPath = join(dir, entry);
    const isDirectory = statSync(entryPath).isDirectory();
    if (!isDirectory) {
      yield entryPath;
      continue;
    }
    yield* walk(entryPath);
  }
}

for (const src of walk('corpus')) {
  if (!exts.has(extname(src).toLowerCase())) continue;
  using doc = Document.open(src);
  writeFileSync(src.replace(/\.\w+$/, '.txt'), doc.plainText());
}

package main

import (
    "os"
    "path/filepath"
    "strings"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

var exts = map[string]bool{
    ".docx": true, ".xlsx": true, ".pptx": true,
    ".doc": true, ".xls": true, ".ppt": true,
}

// main walks "corpus" recursively and writes the plain text of every supported
// Office document to a sibling file with a ".txt" extension.
func main() {
    filepath.Walk("corpus", func(path string, info os.FileInfo, err error) error {
        // Propagate walk errors (aborting the walk); directories are not documents.
        if err != nil || info.IsDir() { return err }
        if !exts[strings.ToLower(filepath.Ext(path))] { return nil }
        doc, err := officeoxide.Open(path)
        if err != nil { return nil } // skip unreadable
        // The defer runs when this callback returns, i.e. once per file.
        defer doc.Close()
        text, err := doc.PlainText()
        // A failed extraction is skipped like an unreadable file instead of
        // leaving behind an empty .txt that looks like a real result.
        if err != nil { return nil }
        return os.WriteFile(strings.TrimSuffix(path, filepath.Ext(path))+".txt", []byte(text), 0644)
    })
}

using OfficeOxide;

// Lower-case extensions (with leading dot) of the formats to extract.
var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt" };

// Visit every file under "corpus" (recursively) and write its plain text
// alongside the source with the extension replaced by ".txt".
foreach (var src in Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories))
{
    var extension = Path.GetExtension(src).ToLowerInvariant();
    if (exts.Contains(extension))
    {
        // The using statement disposes the document before the next file is opened.
        using (var doc = Document.Open(src))
        {
            var outPath = Path.ChangeExtension(src, ".txt");
            File.WriteAllText(outPath, doc.PlainText());
        }
    }
}

#include <stdio.h>
#include "office_oxide.h"

/* Walk a directory however you like (dirent/nftw); for each candidate path: */
/*
 * Extract the plain text of the document at `path` and write it to `out_path`.
 *
 * Returns 0 on success. If the document cannot be opened, returns the error
 * code reported by office_document_open. If text extraction fails, returns the
 * reported error code, or -1 when none was set. If the output file cannot be
 * opened or written, returns -1.
 * NOTE(review): -1 is assumed not to collide with a library error code that
 * callers need to distinguish - confirm against office_oxide.h.
 */
static int extract_one(const char *path, const char *out_path) {
    int err = 0;
    int rc = 0;
    OfficeDocumentHandle *doc = office_document_open(path, &err);
    if (!doc) return err;                 /* skip unreadable */
    char *text = office_document_plain_text(doc, &err);
    if (!text) {
        /* Report the extraction failure instead of returning success. */
        rc = err ? err : -1;
    } else {
        FILE *f = fopen(out_path, "w");
        if (!f) {
            rc = -1;
        } else {
            if (fputs(text, f) == EOF) rc = -1;
            /* fclose flushes buffered output, so its result matters too. */
            if (fclose(f) == EOF) rc = -1;
        }
        office_oxide_free_string(text);
    }
    office_document_free(doc);
    return rc;
}

WASM

import { WasmDocument } from 'office-oxide-wasm';

// Browser, user-uploaded files — runs entirely client-side, no server round-trip.
// <input type="file" multiple accept=".docx,.xlsx,.pptx,.doc,.xls,.ppt">
/**
 * Extracts plain text from each file in `fileList`, one file at a time.
 *
 * The format passed to `WasmDocument` is the lower-cased text after the last
 * `.` in the file name. Each document is freed after its text is read, even
 * if extraction throws.
 *
 * @param {Iterable<{name: string, arrayBuffer: () => Promise<ArrayBuffer>}>} fileList
 * @returns {Promise<Array<{name: string, text: string}>>} Results in input order.
 */
async function extractAll(fileList) {
  const extracted = [];
  for (const upload of fileList) {
    const bytes = new Uint8Array(await upload.arrayBuffer());
    const format = upload.name.split('.').pop().toLowerCase();
    const parsed = new WasmDocument(bytes, format);
    try {
      const text = parsed.plainText();
      extracted.push({ name: upload.name, text });
    } finally {
      parsed.free();
    }
  }
  return extracted;
}

WASMは完全にクライアントサイドで動作します——ファイルはブラウザの外に出ないため、プライバシーが重要なバッチ処理に最適です。

並列処理 — 大規模コーパス向け

数万件のファイルと高速SSDがある場合、並列処理が効果を発揮します。ただしワーカー数を増やしすぎるとディスクが飽和してスループットが低下するため注意が必要です。

Rust (rayon)

use rayon::prelude::*;
use office_oxide::Document;

// Convert every path to Markdown in parallel. Documents that fail to open are
// skipped, and write errors are deliberately ignored.
paths.par_iter().for_each(|path| {
    let doc = match Document::open(path) {
        Ok(doc) => doc,
        Err(_) => return,
    };
    let target = path.with_extension("md");
    let _ = std::fs::write(target, doc.to_markdown());
});

Rayonのデフォルトスレッド数はCPUコア数に合わせて自動設定され、ほぼ常に最適な値です。

Python (ProcessPoolExecutor)

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from office_oxide import Document

def process(path: Path) -> None:
    """Convert one document to Markdown, written next to it with a ".md" suffix.

    Args:
        path: Path of the source document.
    """
    with Document.open(path) as doc:
        # Pin UTF-8 so output does not depend on the platform default encoding,
        # which may be unable to represent the document's text.
        path.with_suffix(".md").write_text(doc.to_markdown(), encoding="utf-8")

# Worker processes started with the "spawn" method (the default on Windows and
# macOS) re-import this module; without the guard each worker would try to
# start its own pool.
if __name__ == "__main__":
    paths = [p for p in Path("corpus").rglob("*")
             if p.suffix.lower() in {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}]

    with ProcessPoolExecutor(max_workers=8) as ex:
        # Draining the iterator waits for every job and re-raises the first
        # exception raised by a worker.
        for _ in ex.map(process, paths):
            pass

PythonバインディングはネイティブパースのあいだGILを解放するためThreadPoolExecutorでも動作しますが、単一のドキュメントがパニックした場合の分離という意味でプロセスの方が安全です。

Go (goroutine pool)

package main

import (
    "os"
    "path/filepath"
    "runtime"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

// main converts every supported document under "corpus" to Markdown, using one
// worker goroutine per CPU fed from a shared channel.
func main() {
    // All six formats, including legacy DOC/XLS/PPT, so this sample selects the
    // same files as the serial one.
    exts := map[string]bool{
        ".docx": true, ".xlsx": true, ".pptx": true,
        ".doc": true, ".xls": true, ".ppt": true,
    }

    var paths []string
    filepath.Walk("corpus", func(p string, info os.FileInfo, err error) error {
        if err != nil || info.IsDir() { return err }
        if exts[strings.ToLower(filepath.Ext(p))] { paths = append(paths, p) }
        return nil
    })

    jobs := make(chan string)
    var wg sync.WaitGroup
    for i := 0; i < runtime.NumCPU(); i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for path := range jobs {
                doc, err := officeoxide.Open(path)
                if err != nil { continue } // skip unreadable
                md, err := doc.ToMarkdown()
                // Only write when conversion succeeded; otherwise an empty .md
                // would be indistinguishable from a real result.
                if err == nil {
                    os.WriteFile(strings.TrimSuffix(path, filepath.Ext(path))+".md", []byte(md), 0644)
                }
                // Close explicitly: a defer here would not run until the
                // goroutine exits, keeping every document open.
                doc.Close()
            }
        }()
    }
    for _, p := range paths { jobs <- p }
    close(jobs) // lets each worker's range loop terminate
    wg.Wait()
}

JavaScript (Promise.all)

import { readdirSync, statSync } from 'node:fs';
import { readFile } from 'node:fs/promises';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

/**
 * Generator over every non-directory path under `dir`, recursing into
 * subdirectories as they are encountered.
 *
 * @param {string} dir - Root directory of the traversal.
 * @yields {string} Path of each non-directory entry, prefixed with `dir`.
 */
function* walk(dir) {
  const names = readdirSync(dir);
  for (const child of names) {
    const childPath = join(dir, child);
    const info = statSync(childPath);
    if (info.isDirectory()) {
      yield* walk(childPath);
    } else {
      yield childPath;
    }
  }
}

const paths = [...walk('corpus')].filter(p => exts.has(extname(p).toLowerCase()));

// Limit concurrency
const CONCURRENCY = 8;
let i = 0;
async function worker() {
  while (i < paths.length) {
    const path = paths[i++];
    using doc = Document.open(path);
    // process doc.toMarkdown() ...
  }
}
await Promise.all(Array.from({ length: CONCURRENCY }, worker));

C# (Parallel.ForEach)

using OfficeOxide;

// All six formats, including legacy DOC/XLS/PPT, so this sample selects the
// same files as the serial one.
var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt" };
var paths = Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories)
    .Where(p => exts.Contains(Path.GetExtension(p).ToLowerInvariant()))
    .ToList();

// Convert each document to Markdown next to its source, with at most one
// conversion in flight per logical processor.
Parallel.ForEach(paths, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount }, path =>
{
    try
    {
        using var doc = Document.Open(path);
        File.WriteAllText(Path.ChangeExtension(path, ".md"), doc.ToMarkdown());
    }
    catch (OfficeOxideException) { /* skip unreadable */ }
});

C (pthreads work item)

#include <stdio.h>
#include "office_oxide.h"

/* Each worker thread pulls a path and converts it; the C ABI is thread-safe
   across independent handles, so one OfficeDocumentHandle per thread is fine. */
/* Convert the document at `path` to Markdown and write it to `out_path`.
   Failures to open, convert, or create the output file are skipped silently. */
static void convert_to_markdown(const char *path, const char *out_path) {
    int status = 0;
    OfficeDocumentHandle *handle = office_document_open(path, &status);
    if (handle == NULL) {
        return;                           /* skip unreadable */
    }

    char *markdown = office_document_to_markdown(handle, &status);
    if (markdown != NULL) {
        FILE *out = fopen(out_path, "w");
        if (out != NULL) {
            fputs(markdown, out);
            fclose(out);
        }
        office_oxide_free_string(markdown);
    }

    office_document_free(handle);
}

非同期処理 — ファイルが外部から来る場合

入力がHTTP、S3、またはキューから来る場合、ネットワーク待ちがパース時間を大きく上回るため非同期I/Oが有効です。パスではなくバイト列から開くようにしてください。

Rust (tokio)

use office_oxide::{Document, DocumentFormat};
use std::io::Cursor;

// Download the full response body before parsing.
let response = reqwest::get(url).await?;
let bytes = response.bytes().await?;
let fmt = DocumentFormat::Docx;

// Extraction is CPU-bound, so it runs on a blocking task rather than inline
// in the async code.
let extract = move || -> office_oxide::Result<String> {
    let reader = Cursor::new(bytes.to_vec());
    let doc = Document::from_reader(reader, fmt)?;
    Ok(doc.plain_text())
};
// First `?` unwraps the task join result, second the extraction result.
let text = tokio::task::spawn_blocking(extract).await??;

Python (asyncio + aiohttp)

import asyncio, aiohttp
from office_oxide import Document

async def fetch_and_extract(session, url):
    """Download the document at ``url`` and return its plain text.

    Args:
        session: Object whose ``get(url)`` is an async context manager yielding
            a response (an ``aiohttp.ClientSession`` in this example).
        url: Document URL. The format is the lower-cased text after the last
            ``.`` once any query string or fragment has been removed.

    Returns:
        The extracted plain text.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error status, plus
        any error raised while parsing.
    """
    async with session.get(url) as r:
        # Fail on HTTP errors instead of handing an error page to the parser.
        r.raise_for_status()
        data = await r.read()
    # Signed URLs carry a query string; strip it (and any fragment) first.
    fmt = url.split("?", 1)[0].split("#", 1)[0].rsplit(".", 1)[-1].lower()

    def parse():
        with Document.from_bytes(data, fmt) as doc:
            return doc.plain_text()

    # Parsing is CPU-bound; run it in the default executor so other downloads
    # keep progressing on the event loop.
    return await asyncio.get_running_loop().run_in_executor(None, parse)

async def main(urls):
    """Fetch and extract every URL concurrently over one shared session.

    Returns the extracted texts in the same order as ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        pending = (fetch_and_extract(session, target) for target in urls)
        texts = await asyncio.gather(*pending)
        return texts

JavaScript (fetch + concurrency limit)

import { Document } from 'office-oxide';

async function fetchAndExtract(url) {
  const res = await fetch(url);
  const buf = Buffer.from(await res.arrayBuffer());
  const fmt = url.split('.').pop().toLowerCase();
  using doc = Document.fromBytes(buf, fmt);
  return doc.plainText();
}

const CONCURRENCY = 16;
const queue = [...urls];
const results = [];

// Each worker keeps taking the next URL from the shared queue until it is
// empty; results are appended in completion order, not input order.
async function drainQueue() {
  while (queue.length) {
    const nextUrl = queue.shift();
    const text = await fetchAndExtract(nextUrl);
    results.push(text);
  }
}

await Promise.all(Array.from({ length: CONCURRENCY }, () => drainQueue()));

Go (HTTP fan-out)

package main

import (
    "io"
    "net/http"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

// fetchAndExtract downloads the document at url and returns its plain text.
// The format handed to the parser is the lower-cased text after the last "."
// of the URL, ignoring any query string or fragment.
func fetchAndExtract(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil { return "", err }
    defer resp.Body.Close()
    data, err := io.ReadAll(resp.Body)
    if err != nil { return "", err }

    // Signed URLs carry a query string; strip it (and any fragment) first.
    name := url
    if i := strings.IndexAny(name, "?#"); i >= 0 { name = name[:i] }
    // Lower-case so "REPORT.DOCX" yields the same format as "report.docx".
    format := strings.ToLower(name[strings.LastIndex(name, ".")+1:])
    doc, err := officeoxide.OpenFromBytes(data, format)
    if err != nil { return "", err }
    defer doc.Close()
    return doc.PlainText()
}

// main fetches every URL concurrently, with at most maxInFlight fetches
// running at once.
func main() {
    urls := []string{ /* ... */ }

    const maxInFlight = 16
    slots := make(chan struct{}, maxInFlight) // concurrency cap
    var pending sync.WaitGroup

    for _, target := range urls {
        pending.Add(1)
        // Blocks here while maxInFlight fetches are already running.
        slots <- struct{}{}
        go func(url string) {
            defer pending.Done()
            defer func() { <-slots }() // hand the slot back
            text, _ := fetchAndExtract(url)
            _ = text // process...
        }(target)
    }
    pending.Wait()
}

C# (HttpClient + async)

using OfficeOxide;

using var http = new HttpClient();

// Downloads the document at `url` and returns its plain text. The format is
// the lower-cased text after the last '.' of the URL path.
async Task<string> FetchAndExtract(string url)
{
    var data = await http.GetByteArrayAsync(url);
    // Use the path component only, so a query string or fragment (common on
    // signed URLs) does not end up in the format.
    var path = new Uri(url).AbsolutePath;
    var fmt = path[(path.LastIndexOf('.') + 1)..].ToLowerInvariant();
    using var doc = Document.FromBytes(data, fmt);
    return doc.PlainText();
}

// Limit concurrency with SemaphoreSlim
var sem = new SemaphoreSlim(16);
var tasks = urls.Select(async url =>
{
    await sem.WaitAsync();
    // Release in finally so a failed download does not leak a slot.
    try { return await FetchAndExtract(url); }
    finally { sem.Release(); }
});
var results = await Task.WhenAll(tasks);

C (open from bytes)

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include "office_oxide.h"

/* After fetching `data`/`len` over HTTP, parse without touching the filesystem. */
/* Parse `len` bytes at `data` as format `fmt` ("docx" etc.) and return the
   plain text, or NULL if the document cannot be opened or extracted.
   Free the returned string with office_oxide_free_string. */
char *extract_from_bytes(const uint8_t *data, size_t len, const char *fmt) {
    int status = 0;
    char *text = NULL;
    OfficeDocumentHandle *handle =
        office_document_open_from_bytes(data, len, fmt, &status);
    if (handle != NULL) {
        text = office_document_plain_text(handle, &status);
        office_document_free(handle);
    }
    return text;
}

WASM (open from bytes)

import { WasmDocument } from 'office-oxide-wasm';

// Edge / browser: fetch bytes and extract with no server round-trip.
/**
 * Fetches a document and returns its plain text.
 *
 * @param {string} url - Document URL. The format is the lower-cased text after
 *   the last `.` once any query string or fragment has been removed.
 * @returns {Promise<string>} Extracted plain text.
 * @throws {Error} If the response status is not in the 2xx range.
 */
async function fetchAndExtract(url) {
  const res = await fetch(url);
  // fetch() resolves on HTTP error statuses; fail here rather than handing an
  // error page to the parser.
  if (!res.ok) {
    throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`);
  }
  const data = new Uint8Array(await res.arrayBuffer());
  // Signed URLs carry a query string; strip it (and any fragment) first.
  const fmt = url.split(/[?#]/, 1)[0].split('.').pop().toLowerCase();
  const doc = new WasmDocument(data, fmt);
  try {
    return doc.plainText();
  } finally {
    // Free the document even if extraction throws.
    doc.free();
  }
}

メモリに関するヒント

非常に大きなXLSXファイルを扱う場合は、Rustでmmapフィーチャ（features = ["mmap"]）を有効にしてDocument::open_mmapを呼び出すことで、アーカイブ全体をヒープにコピーせずに済みます。
ワーカーごとにDocumentは一度に1つだけ開くようにしてください。各ハンドルはパース済み構造をメモリに保持しており、クローズ（Rustでのdrop、Pythonでのwithブロック終了、JSでのclose()/using）すると解放されます。
LLMへの大規模取り込みにはto_html()よりto_markdown()を優先してください——Markdownは出力サイズが小さく、下流でのLLMスループットが向上します。

バッチ処理

シリアルループ — まず検討すべき選択肢

並列処理 — 大規模コーパス向け

非同期処理 — ファイルが外部から来る場合

メモリに関するヒント

関連ドキュメント