批处理
Office Oxide 足够快,所以大多数批处理任务的瓶颈是磁盘 I/O,不是解析。一份典型的 Word 文档抽取要 0.8ms — 也就是单线程每秒能啃 ~1,000 份文件。
本指南讲述能扩展的几种模式:小任务用串行循环,中等任务用线程池,从 S3 或 HTTP 流式读取时用异步 I/O。
串行循环 — 正确的默认
在本地磁盘上几千份以内,朴素的串行循环最简单,往往也最快。你避免了启动工作进程的开销和并行磁盘读的争用。
Python
from pathlib import Path
from office_oxide import Document
# Suffixes (lower-case, with leading dot) of the formats to convert.
# Built once instead of on every loop iteration.
OFFICE_SUFFIXES = {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}

# Walk corpus/ recursively and write the extracted text next to each document.
for src in Path("corpus").rglob("*"):
    if src.suffix.lower() not in OFFICE_SUFFIXES:
        continue
    with Document.open(src) as doc:
        text = doc.plain_text()
    # Write UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) cannot represent every character a document may contain and
    # would raise UnicodeEncodeError part-way through the batch.
    src.with_suffix(".txt").write_text(text, encoding="utf-8")
Rust
use std::path::Path;
use office_oxide::Document;
use walkdir::WalkDir;
// Walk corpus/ recursively and write the extracted text next to each document.
for entry in WalkDir::new("corpus") {
    let entry = entry?;
    let path = entry.path();

    // Entries without a UTF-8 extension are not candidates.
    let ext = match path.extension().and_then(|e| e.to_str()) {
        Some(raw) => raw.to_ascii_lowercase(),
        None => continue,
    };
    let supported = matches!(
        ext.as_str(),
        "docx" | "xlsx" | "pptx" | "doc" | "xls" | "ppt"
    );
    if !supported {
        continue;
    }

    let doc = Document::open(path)?;
    std::fs::write(path.with_extension("txt"), doc.plain_text())?;
}
JavaScript
import { readdirSync, statSync, writeFileSync } from 'node:fs';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';
// Lower-case extensions (with leading dot) of the formats to convert.
const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

/**
 * Recursively yields the path of every non-directory entry below `dir`,
 * descending into each subdirectory as soon as it is encountered.
 *
 * @param {string} dir - Directory to traverse.
 * @yields {string} `dir` joined with the relative path of each entry.
 */
function* walk(dir) {
  for (const entryName of readdirSync(dir)) {
    const entryPath = join(dir, entryName);
    const isDir = statSync(entryPath).isDirectory();
    if (!isDir) {
      yield entryPath;
      continue;
    }
    yield* walk(entryPath);
  }
}
for (const src of walk('corpus')) {
if (!exts.has(extname(src).toLowerCase())) continue;
using doc = Document.open(src);
writeFileSync(src.replace(/\.\w+$/, '.txt'), doc.plainText());
}
WASM(浏览器,用户上传的文件)
import { WasmDocument } from 'office-oxide-wasm';
// <input type="file" multiple accept=".docx,.xlsx,.pptx,.doc,.xls,.ppt">
/**
 * Extracts plain text from each uploaded file, one file at a time.
 *
 * @param {Iterable<{name: string, arrayBuffer: () => Promise<ArrayBuffer>}>} fileList
 *   Files to process, e.g. the `files` of an `<input type="file">`.
 * @returns {Promise<Array<{name: string, text: string}>>} One entry per file,
 *   in input order.
 */
async function extractAll(fileList) {
  const extracted = [];
  for (const upload of fileList) {
    const buffer = await upload.arrayBuffer();
    const bytes = new Uint8Array(buffer);
    // The format hint is whatever follows the last dot of the name, lower-cased.
    const nameParts = upload.name.split('.');
    const format = nameParts[nameParts.length - 1].toLowerCase();
    const parsed = new WasmDocument(bytes, format);
    try {
      const text = parsed.plainText();
      extracted.push({ name: upload.name, text });
    } finally {
      // Free the document even when extraction throws.
      parsed.free();
    }
  }
  return extracted;
}
完全在客户端运行 — 文件从不离开浏览器,非常适合对隐私敏感的批处理任务。
Go
package main
import (
"os"
"path/filepath"
"strings"
officeoxide "github.com/yfedoseev/office_oxide/go"
)
// exts is the set of lower-case file extensions that should be converted.
var exts = map[string]bool{
	".docx": true, ".xlsx": true, ".pptx": true,
	".doc": true, ".xls": true, ".ppt": true,
}

// main walks corpus/ and writes a .txt file next to every supported document.
// Documents that cannot be opened or extracted are skipped; traversal and
// write errors stop the walk and are reported on stderr with exit status 1.
func main() {
	err := filepath.Walk("corpus", func(path string, info os.FileInfo, err error) error {
		if err != nil || info.IsDir() {
			return err
		}
		ext := filepath.Ext(path)
		if !exts[strings.ToLower(ext)] {
			return nil
		}
		doc, err := officeoxide.Open(path)
		if err != nil {
			return nil // skip unreadable
		}
		// The callback runs once per file, so this defer closes each document
		// before the next one is opened.
		defer doc.Close()
		text, err := doc.PlainText()
		if err != nil {
			return nil // skip rather than write an empty .txt
		}
		return os.WriteFile(strings.TrimSuffix(path, ext)+".txt", []byte(text), 0644)
	})
	if err != nil {
		os.Stderr.WriteString(err.Error() + "\n")
		os.Exit(1)
	}
}
C#
using OfficeOxide;
// Extensions (lower-case, with leading dot) of the formats to convert.
var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt" };

// Visit every file under corpus/ and write the extracted text next to each
// supported document.
var allFiles = Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories);
foreach (var src in allFiles)
{
    var extension = Path.GetExtension(src).ToLowerInvariant();
    if (exts.Contains(extension))
    {
        // Disposed at the end of this block, i.e. once per file.
        using var doc = Document.Open(src);
        var target = Path.ChangeExtension(src, ".txt");
        File.WriteAllText(target, doc.PlainText());
    }
}
并行 — 用于大型语料
当你有几万份文件和快速 SSD 时,并行才有意义。当心:工作进程太多会让磁盘饱和、反而降吞吐。
Python(ProcessPoolExecutor)
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from office_oxide import Document
# Suffixes (lower-case, with leading dot) of the formats to convert.
OFFICE_SUFFIXES = {".docx", ".xlsx", ".pptx", ".doc", ".xls", ".ppt"}


def process(path: Path) -> None:
    """Convert one document to Markdown, writing ``<name>.md`` next to it.

    Runs inside a worker process, so it must stay importable at module level.
    """
    with Document.open(path) as doc:
        markdown = doc.to_markdown()
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent every character in the document.
    path.with_suffix(".md").write_text(markdown, encoding="utf-8")


# The guard is required: under the "spawn" start method (the default on
# Windows and macOS) every worker re-imports this module, and without the
# guard each worker would rescan the corpus and try to start its own pool.
if __name__ == "__main__":
    paths = [p for p in Path("corpus").rglob("*")
             if p.suffix.lower() in OFFICE_SUFFIXES]
    with ProcessPoolExecutor(max_workers=8) as ex:
        # Each task is very short, so batch several paths per round trip to
        # keep inter-process overhead from dominating. Iterating the results
        # re-raises the first exception a worker hit.
        for _ in ex.map(process, paths, chunksize=16):
            pass
Python 绑定在原生解析期间会释放 GIL,所以 ThreadPoolExecutor 也能用 — 但单个文档触发 panic(恐慌)时,进程隔离更稳妥。
Rust(rayon)
use rayon::prelude::*;
use office_oxide::Document;
// Convert each path on the rayon pool; documents that fail to open and
// failed writes are ignored.
paths.par_iter().for_each(|path| match Document::open(path) {
    Ok(doc) => {
        let target = path.with_extension("md");
        let _ = std::fs::write(target, doc.to_markdown());
    }
    Err(_) => {}
});
rayon 默认线程数与 CPU 数一致 — 几乎总是正确选择。
Go (goroutine pool)
package main
import (
"os"
"path/filepath"
"runtime"
"strings"
"sync"
officeoxide "github.com/yfedoseev/office_oxide/go"
)
// convert writes the Markdown rendering of the document at path next to it,
// replacing the original extension with ".md".
func convert(path string) error {
	doc, err := officeoxide.Open(path)
	if err != nil {
		return err
	}
	// Deferred so the document is closed on every return path.
	defer doc.Close()
	md, err := doc.ToMarkdown()
	if err != nil {
		return err // do not write an empty .md for a failed conversion
	}
	return os.WriteFile(strings.TrimSuffix(path, filepath.Ext(path))+".md", []byte(md), 0644)
}

// main collects every .docx/.xlsx/.pptx file under corpus/ and converts them
// with one worker goroutine per CPU. Files that fail are reported on stderr
// and skipped.
func main() {
	var paths []string
	walkErr := filepath.Walk("corpus", func(p string, info os.FileInfo, err error) error {
		if err != nil || info.IsDir() {
			return err
		}
		ext := strings.ToLower(filepath.Ext(p))
		if ext == ".docx" || ext == ".xlsx" || ext == ".pptx" {
			paths = append(paths, p)
		}
		return nil
	})
	if walkErr != nil {
		// Report the traversal failure, then still convert what was collected.
		os.Stderr.WriteString("walk: " + walkErr.Error() + "\n")
	}

	jobs := make(chan string)
	var wg sync.WaitGroup
	for i := 0; i < runtime.NumCPU(); i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for path := range jobs {
				if err := convert(path); err != nil {
					os.Stderr.WriteString("skipping " + path + ": " + err.Error() + "\n")
				}
			}
		}()
	}
	for _, p := range paths {
		jobs <- p
	}
	// Closing the channel ends each worker's range loop.
	close(jobs)
	wg.Wait()
}
C# (Parallel.ForEach)
using OfficeOxide;
// Extensions (lower-case, with leading dot) of the formats to convert.
var exts = new HashSet<string> { ".docx", ".xlsx", ".pptx" };

// Materialise the work list up front so the directory scan is finished
// before the parallel loop starts.
var paths = (from p in Directory.EnumerateFiles("corpus", "*", SearchOption.AllDirectories)
             where exts.Contains(Path.GetExtension(p).ToLowerInvariant())
             select p).ToList();

var options = new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount };
Parallel.ForEach(paths, options, path => ConvertToMarkdown(path));

// Writes <name>.md next to the document; documents that raise
// OfficeOxideException are skipped.
void ConvertToMarkdown(string path)
{
    try
    {
        using var doc = Document.Open(path);
        var target = Path.ChangeExtension(path, ".md");
        File.WriteAllText(target, doc.ToMarkdown());
    }
    catch (OfficeOxideException)
    {
        /* skip unreadable */
    }
}
JavaScript(Promise.all)
import { readdirSync, statSync } from 'node:fs';
import { readFile } from 'node:fs/promises';
import { join, extname } from 'node:path';
import { Document } from 'office-oxide';
// Lower-case extensions (with leading dot) of the formats to convert.
const exts = new Set(['.docx', '.xlsx', '.pptx', '.doc', '.xls', '.ppt']);

/**
 * Depth-first generator over every non-directory entry below `dir`.
 *
 * @param {string} dir - Directory to traverse.
 * @yields {string} `dir` joined with the relative path of each entry.
 */
function* walk(dir) {
  const children = readdirSync(dir).map((name) => join(dir, name));
  for (const child of children) {
    if (statSync(child).isDirectory()) {
      yield* walk(child);
    } else {
      yield child;
    }
  }
}
const paths = [...walk('corpus')].filter(p => exts.has(extname(p).toLowerCase()));
// 限制并发
const CONCURRENCY = 8;
let i = 0;
async function worker() {
while (i < paths.length) {
const path = paths[i++];
using doc = Document.open(path);
// 处理 doc.toMarkdown()...
}
}
await Promise.all(Array.from({ length: CONCURRENCY }, worker));
异步 — 输入来自别处时
如果输入来自 HTTP、S3 或队列,异步 I/O 会赢,因为网络主导了解析时间。从字节打开,绝不要从路径打开。
Python(asyncio + aiohttp)
import asyncio, aiohttp
from office_oxide import Document
async def fetch_and_extract(session, url):
    """Download ``url`` and return the plain text of the document it serves.

    Args:
        session: An open ``aiohttp.ClientSession``.
        url: Document URL; the format is taken from the extension of its path.

    Returns:
        The extracted plain text.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status, so an error page is never handed to the parser.
    """
    from urllib.parse import urlsplit

    async with session.get(url) as r:
        r.raise_for_status()
        data = await r.read()
    # Use only the URL path so "?query" and "#fragment" parts cannot end up
    # in the format string.
    fmt = urlsplit(url).path.rsplit(".", 1)[-1].lower()

    def _parse():
        with Document.from_bytes(data, fmt) as doc:
            return doc.plain_text()

    # Parsing is CPU-bound; run it in the default executor so the event loop
    # keeps servicing the other downloads in the meantime.
    return await asyncio.get_running_loop().run_in_executor(None, _parse)


async def main(urls, concurrency=16):
    """Extract text from every URL, with at most ``concurrency`` in flight.

    Args:
        urls: Iterable of document URLs.
        concurrency: Maximum number of simultaneous fetch-and-parse tasks.

    Returns:
        A list of extracted texts in the same order as ``urls``.
    """
    sem = asyncio.Semaphore(concurrency)

    async def bounded(session, url):
        # Bound how many response bodies are held in memory at once.
        async with sem:
            return await fetch_and_extract(session, url)

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(bounded(session, u) for u in urls))
Rust(tokio)
use office_oxide::{Document, DocumentFormat};
use std::io::Cursor;
// Fail on 4xx/5xx responses so an HTTP error page is never handed to the parser.
let bytes = reqwest::get(url).await?.error_for_status()?.bytes().await?;
let fmt = DocumentFormat::Docx;
// Move parsing onto a blocking task: extraction is CPU-bound and would
// otherwise stall the async executor.
let text = tokio::task::spawn_blocking(move || -> office_oxide::Result<String> {
    let doc = Document::from_reader(Cursor::new(bytes.to_vec()), fmt)?;
    Ok(doc.plain_text())
})
.await??; // outer `?`: task join error; inner `?`: parse error
JavaScript (fetch + concurrency limit)
import { Document } from 'office-oxide';
async function fetchAndExtract(url) {
const res = await fetch(url);
const buf = Buffer.from(await res.arrayBuffer());
const fmt = url.split('.').pop().toLowerCase();
using doc = Document.fromBytes(buf, fmt);
return doc.plainText();
}
// Upper bound on the number of downloads in flight at once.
const CONCURRENCY = 16;
// Snapshot of the input so later changes to `urls` cannot affect the run.
const queue = [...urls];
// results[k] is the text for queue[k]. Writing by index keeps the output
// aligned with the input even though downloads finish out of order.
const results = new Array(queue.length);
let next = 0;
await Promise.all(Array.from({ length: CONCURRENCY }, async () => {
  while (next < queue.length) {
    // Claim the index before awaiting so no two workers take the same URL.
    const index = next++;
    results[index] = await fetchAndExtract(queue[index]);
  }
}));
Go (HTTP fan-out)
package main
import (
"io"
"net/http"
"strings"
"sync"
officeoxide "github.com/yfedoseev/office_oxide/go"
)
// httpStatusError reports a response whose status is outside the 2xx range.
type httpStatusError string

func (e httpStatusError) Error() string { return string(e) }

// fetchAndExtract downloads url and returns the plain text of the document
// it serves. The format passed to the parser is the lower-cased text after
// the last "." in url. A non-2xx response is returned as an error rather
// than being handed to the parser.
func fetchAndExtract(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", httpStatusError("GET " + url + ": unexpected status " + resp.Status)
	}
	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	// Lower-cased so "REPORT.DOCX" is treated the same as "report.docx".
	format := strings.ToLower(url[strings.LastIndex(url, ".")+1:])
	doc, err := officeoxide.OpenFromBytes(data, format)
	if err != nil {
		return "", err
	}
	defer doc.Close()
	return doc.PlainText()
}
// main fetches every URL concurrently, with at most 16 requests in flight.
func main() {
	urls := []string{ /* ... */ }

	var pending sync.WaitGroup
	slots := make(chan struct{}, 16) // concurrency cap
	for idx := range urls {
		target := urls[idx]
		pending.Add(1)
		// Blocks here until one of the 16 slots is free.
		slots <- struct{}{}
		go func() {
			defer func() {
				// Free the slot first, then mark this task as finished.
				<-slots
				pending.Done()
			}()
			text, _ := fetchAndExtract(target)
			_ = text // process...
		}()
	}
	pending.Wait()
}
C# (HttpClient + async)
using OfficeOxide;
using var http = new HttpClient();

// Downloads the document at `url` and returns its plain text. The format is
// the lower-cased text after the last '.' in the URL.
async Task<string> FetchAndExtract(string url)
{
    var data = await http.GetByteArrayAsync(url);
    var dot = url.LastIndexOf('.');
    var fmt = url.Substring(dot + 1).ToLowerInvariant();
    using var doc = Document.FromBytes(data, fmt);
    return doc.PlainText();
}

// Cap concurrency with a SemaphoreSlim.
var sem = new SemaphoreSlim(16);

// Runs FetchAndExtract once a semaphore slot is available and always
// releases the slot afterwards.
async Task<string> Throttled(string url)
{
    await sem.WaitAsync();
    try
    {
        return await FetchAndExtract(url);
    }
    finally
    {
        sem.Release();
    }
}

var tasks = urls.Select(url => Throttled(url));
var results = await Task.WhenAll(tasks);
内存提示
- 对超大 XLSX,在 Rust 里启用 `mmap` 功能开关(`features = ["mmap"]`)并调用 `Document::open_mmap`,避免把整个归档复制进堆。
- 每个工作进程一次只开一个 `Document`。每个句柄都把已解析结构放在内存里;关闭它(Rust `drop`、Python 退出 `with` 块、JS `close()` / `using`)才会释放。
- 在大规模 LLM 接入里,优先 `to_markdown()` 而不是 `to_html()` — Markdown 输出更小、下游 LLM 吞吐更好。
相关链接
- 性能基准 — 含 p99 的完整数据
- Office for RAG — RAG 专属模式