What is the fastest Python library for DOCX, XLSX, and PPTX?

Office Oxide is the fastest. DOCX text extraction averages 0.8ms (vs 11.8ms for python-docx — 14× faster). XLSX averages 5.0ms (vs 94.5ms for openpyxl — 18× faster). PPTX averages 0.7ms (vs 32.5ms for python-pptx — 46× faster). Benchmarked on 6,062 real-world files.

Is Office Oxide free for commercial use?

Yes. Office Oxide is dual-licensed MIT OR Apache-2.0 — free for all uses including commercial products, SaaS, and proprietary software. No license fees, no sales calls, no AGPL or copyleft restrictions.

Does Office Oxide handle legacy .doc, .xls, and .ppt files?

Yes. Office Oxide reads all six formats: DOCX, XLSX, PPTX, plus legacy DOC, XLS, PPT. It is the only Rust or Python library that supports all three legacy formats without a JVM (Apache Tika) or external binaries (catdoc, antiword).

Can Office Oxide convert documents to Markdown?

Yes. Every supported format has built-in to_markdown() that preserves headings, tables, lists, and structure — ideal for LLM and RAG pipelines. No separate package needed.

How does Office Oxide compare to calamine and openpyxl for XLSX?

On 1,802 XLSX files: Office Oxide averages 5.0ms (97.8% pass rate). python-calamine averages 13.9ms (96.6%). openpyxl averages 94.5ms (96.2%). Office Oxide is 2.8× faster than calamine and 18× faster than openpyxl, with the highest pass rate.

Does Office Oxide work in the browser?

Yes. Office Oxide ships a WASM build (office-oxide-wasm on npm) that runs in any browser or bundler. Process Office documents client-side with no server round-trips — useful for privacy-sensitive workloads.

Пакетная обработка

Office Oxide настолько быстрый, что для большинства пакетных задач узким местом становится disk I/O, а не парсинг. Типичный Word-документ извлекается за 0.8 мс — то есть один поток съедает ~1 000 файлов в секунду.

Этот гайд про паттерны, которые масштабируются: последовательный цикл для маленьких задач, пул потоков для средних, async I/O — когда стримите из S3 или HTTP.

Последовательный цикл — правильный дефолт

До нескольких тысяч файлов на локальном диске простой sequential-цикл — самый простой и часто самый быстрый выбор. Вы избегаете накладных расходов на запуск воркеров и конкуренции за параллельные дисковые чтения.

Python

from pathlib import Path
from office_oxide import Document

for src in Path("corpus").rglob("*"):
    if src.suffix.lower() in {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}:
        with Document.open(src) as doc:
            text = doc.plain_text()
        src.with_suffix(".txt").write_text(text)

Rust

use std::path::Path;
use office_oxide::Document;
use walkdir::WalkDir;

for entry in WalkDir::new("corpus") {
    let entry = entry?;
    let path = entry.path();
    if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
        if matches!(ext.to_ascii_lowercase().as_str(),
                    "docx" | "xlsx" | "pptx" | "doc" | "xls" | "ppt") {
            let doc = Document::open(path)?;
            std::fs::write(path.with_extension("txt"), doc.plain_text())?;
        }
    }
}

JavaScript

import { readdirSync, statSync, writeFileSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

function* walk(dir) {
  for (const name of readdirSync(dir)) {
    const full = join(dir, name);
    if (statSync(full).isDirectory()) yield* walk(full);
    else yield full;
  }
}

for (const src of walk('corpus')) {
  if (!exts.has(extname(src).toLowerCase())) continue;
  using doc = Document.open(src);
  writeFileSync(src.replace(/\.\w+$/, '.txt'), doc.plainText());
}

WASM (браузер, файлы от пользователя)

import { WasmDocument } from 'office-oxide-wasm';

// <input type="file" multiple accept=".docx,.xlsx,.pptx,.doc,.xls,.ppt">
async function extractAll(fileList) {
  const results = [];
  for (const file of fileList) {
    const data = new Uint8Array(await file.arrayBuffer());
    const fmt = file.name.split('.').pop().toLowerCase();
    const doc = new WasmDocument(data, fmt);
    try {
      results.push({ name: file.name, text: doc.plainText() });
    } finally {
      doc.free();
    }
  }
  return results;
}

Работает целиком на клиенте — файлы не покидают браузер, идеально для приватных пакетных задач.

Go

package main

import (
    "os"
    "path/filepath"
    "strings"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

var exts = map[string]bool{
    ".docx": true, ".xlsx": true, ".pptx": true,
    ".doc": true, ".xls": true, ".ppt": true,
}

func main() {
    filepath.Walk("corpus", func(path string, info os.FileInfo, err error) error {
        if err != nil || info.IsDir() { return err }
        if !exts[strings.ToLower(filepath.Ext(path))] { return nil }
        doc, err := officeoxide.Open(path)
        if err != nil { return nil } // пропускаем нечитаемые
        defer doc.Close()
        text, _ := doc.PlainText()
        return os.WriteFile(strings.TrimSuffix(path, filepath.Ext(path))+".txt", []byte(text), 0644)
    })
}

C#

using OfficeOxide;

var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt" };

foreach (var src in Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories))
{
    if (!exts.Contains(Path.GetExtension(src).ToLowerInvariant())) continue;
    using var doc = Document.Open(src);
    File.WriteAllText(Path.ChangeExtension(src, ".txt"), doc.PlainText());
}

Параллельно — для крупных корпусов

Когда у вас десятки тысяч файлов и быстрый SSD, параллелизм помогает. Аккуратно: слишком много воркеров засат́урят диск и убьют пропускную способность.

Python (ProcessPoolExecutor)

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from office_oxide import Document

def process(path: Path) -> None:
    with Document.open(path) as doc:
        path.with_suffix(".md").write_text(doc.to_markdown())

paths = [p for p in Path("corpus").rglob("*")
         if p.suffix.lower() in {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}]

with ProcessPoolExecutor(max_workers=8) as ex:
    for _ in ex.map(process, paths):
        pass

Python-привязка отпускает GIL во время нативного парсинга, так что ThreadPoolExecutor тоже работает — но процессы дают лучшую изоляцию, если один документ запаникует.

Rust (rayon)

use rayon::prelude::*;
use office_oxide::Document;

paths.par_iter().for_each(|path| {
    if let Ok(doc) = Document::open(path) {
        let _ = std::fs::write(path.with_extension("md"), doc.to_markdown());
    }
});

Дефолтное число потоков rayon совпадает с числом CPU — почти всегда правильная настройка.

Go (goroutine pool)

package main

import (
    "os"
    "path/filepath"
    "runtime"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

func main() {
    var paths []string
    filepath.Walk("corpus", func(p string, info os.FileInfo, err error) error {
        if err != nil || info.IsDir() { return err }
        ext := strings.ToLower(filepath.Ext(p))
        if ext == ".docx" || ext == ".xlsx" || ext == ".pptx" { paths = append(paths, p) }
        return nil
    })

    jobs := make(chan string)
    var wg sync.WaitGroup
    for i := 0; i < runtime.NumCPU(); i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for path := range jobs {
                doc, err := officeoxide.Open(path)
                if err != nil { continue }
                md, _ := doc.ToMarkdown()
                os.WriteFile(strings.TrimSuffix(path, filepath.Ext(path))+".md", []byte(md), 0644)
                doc.Close()
            }
        }()
    }
    for _, p := range paths { jobs <- p }
    close(jobs)
    wg.Wait()
}

C# (Parallel.ForEach)

using OfficeOxide;

var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx" };
var paths = Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories)
    .Where(p => exts.Contains(Path.GetExtension(p).ToLowerInvariant()))
    .ToList();

Parallel.ForEach(paths, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount }, path =>
{
    try
    {
        using var doc = Document.Open(path);
        File.WriteAllText(Path.ChangeExtension(path, ".md"), doc.ToMarkdown());
    }
    catch (OfficeOxideException) { /* пропускаем нечитаемые */ }
});

JavaScript (Promise.all)

import { readdirSync, statSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';

const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

function* walk(dir) {
  for (const name of readdirSync(dir)) {
    const full = join(dir, name);
    if (statSync(full).isDirectory()) yield* walk(full);
    else yield full;
  }
}

const paths = [...walk('corpus')].filter(p => exts.has(extname(p).toLowerCase()));

// Ограничиваем параллелизм
const CONCURRENCY = 8;
let i = 0;
async function worker() {
  while (i < paths.length) {
    const path = paths[i++];
    using doc = Document.open(path);
    // обработать doc.toMarkdown() ...
  }
}
await Promise.all(Array.from({ length: CONCURRENCY }, worker));

Async — когда файлы приходят извне

Если входы приходят из HTTP, S3 или очереди — async I/O выигрывает, потому что сеть доминирует над временем парсинга. Открывайте из байтов, никогда не из пути.

Python (asyncio + aiohttp)

import asyncio, aiohttp
from office_oxide import Document

async def fetch_and_extract(session, url):
    async with session.get(url) as r:
        data = await r.read()
    fmt = url.rsplit(".", 1)[-1].lower()
    with Document.from_bytes(data, fmt) as doc:
        return doc.plain_text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_and_extract(session, u) for u in urls))

Rust (tokio)

use office_oxide::{Document, DocumentFormat};
use std::io::Cursor;

let bytes = reqwest::get(url).await?.bytes().await?;
let fmt = DocumentFormat::Docx;
// Перекидываем парсинг в blocking-таск — извлечение CPU-bound
let text = tokio::task::spawn_blocking(move || -> office_oxide::Result<String> {
    let doc = Document::from_reader(Cursor::new(bytes.to_vec()), fmt)?;
    Ok(doc.plain_text())
}).await??;

JavaScript (fetch + concurrency limit)

import { Document } from 'office-oxide';

async function fetchAndExtract(url) {
  const res = await fetch(url);
  const buf = Buffer.from(await res.arrayBuffer());
  const fmt = url.split('.').pop().toLowerCase();
  using doc = Document.fromBytes(buf, fmt);
  return doc.plainText();
}

const CONCURRENCY = 16;
const queue = [...urls];
const results = [];
await Promise.all(Array.from({ length: CONCURRENCY }, async () => {
  while (queue.length) {
    const url = queue.shift();
    results.push(await fetchAndExtract(url));
  }
}));

Go (HTTP fan-out)

package main

import (
    "io"
    "net/http"
    "strings"
    "sync"

    officeoxide "github.com/yfedoseev/office_oxide/go"
)

func fetchAndExtract(url string) (string, error) {
    resp, err := http.Get(url)
    if err != nil { return "", err }
    defer resp.Body.Close()
    data, err := io.ReadAll(resp.Body)
    if err != nil { return "", err }

    fmt := url[strings.LastIndex(url, ".")+1:]
    doc, err := officeoxide.OpenFromBytes(data, fmt)
    if err != nil { return "", err }
    defer doc.Close()
    return doc.PlainText()
}

func main() {
    urls := []string{ /* ... */ }
    sem := make(chan struct{}, 16) // лимит параллелизма
    var wg sync.WaitGroup
    for _, u := range urls {
        wg.Add(1)
        sem <- struct{}{}
        go func(url string) {
            defer wg.Done()
            defer func() { <-sem }()
            text, _ := fetchAndExtract(url)
            _ = text // обработать...
        }(u)
    }
    wg.Wait()
}

C# (HttpClient + async)

using OfficeOxide;

using var http = new HttpClient();

async Task<string> FetchAndExtract(string url)
{
    var data = await http.GetByteArrayAsync(url);
    var fmt = url[(url.LastIndexOf('.') + 1)..].ToLowerInvariant();
    using var doc = Document.FromBytes(data, fmt);
    return doc.PlainText();
}

// Ограничиваем параллелизм через SemaphoreSlim
var sem = new SemaphoreSlim(16);
var tasks = urls.Select(async url =>
{
    await sem.WaitAsync();
    try { return await FetchAndExtract(url); }
    finally { sem.Release(); }
});
var results = await Task.WhenAll(tasks);

Подсказки по памяти

Для очень больших XLSX в Rust собирайтесь с фичей mmap (features = ["mmap"]) и зовите Document::open_mmap, чтобы не копировать архив целиком в кучу.
Держите по одному открытому Document на воркер за раз. Каждый handle хранит распарсенную структуру в памяти; закрытие (drop в Rust, выход из with в Python, close()/using в JS) освобождает её.
Для LLM-ингеста в больших масштабах предпочитайте to_markdown() to_html() — Markdown даёт меньший вывод и лучшую LLM-пропускную способность дальше.

Смотрите также

Бенчмарки производительности — полные числа, включая p99
Office для RAG — RAG-специфичные паттерны