Skip to content

处理字节与流

Office Oxide 把原始字节当作一等输入:无需先写入临时文件再解析。这在无服务器函数、multipart 上传、S3 对象和数据库 BLOB 等场景中很有用。

来自 HTTP 响应

Python

import requests
from office_oxide import Document

# Fetch the document over HTTP. The timeout keeps a stalled server from
# blocking the caller indefinitely.
resp = requests.get("https://example.com/report.docx", timeout=30)
# Fail fast on 4xx/5xx so an error page is never handed to the parser as DOCX.
resp.raise_for_status()

# Parse straight from the in-memory response body; no temporary file is needed.
with Document.from_bytes(resp.content, "docx") as doc:
    print(doc.to_markdown())

JavaScript

import { Document } from 'office-oxide';

const res = await fetch('https://example.com/report.docx');
const data = new Uint8Array(await res.arrayBuffer());
using doc = Document.fromBytes(data, 'docx');
console.log(doc.toMarkdown());

Rust(reqwest)

use std::io::Cursor;
use office_oxide::{Document, DocumentFormat};

// `error_for_status` turns 4xx/5xx responses into an `Err`, so an error page
// is never fed to the DOCX parser.
let bytes = reqwest::blocking::get(url)?.error_for_status()?.bytes()?;
// Wrap the downloaded bytes in a `Cursor` so they can be passed to `from_reader`.
let doc = Document::from_reader(Cursor::new(bytes.to_vec()), DocumentFormat::Docx)?;

Go

// Download the document and read the whole body into memory.
// NOTE(review): every error is discarded with `_` to keep the example short.
// If http.Get fails, resp is presumably nil and the deferred Close would
// panic — check each error in production code.
resp, _ := http.Get(url)
defer resp.Body.Close()
data, _ := io.ReadAll(resp.Body)
// Parse the in-memory bytes as DOCX.
doc, _ := officeoxide.OpenFromBytes(data, "docx")

C#

// Download the whole payload into a byte array.
using var http = new HttpClient();
byte[] data = await http.GetByteArrayAsync(url);
// Parse the in-memory bytes as DOCX; `using var` disposes the document at the end of the scope.
using var doc = Document.FromBytes(data, "docx");

来自 S3

Python(boto3)

import boto3
from office_oxide import Document

# Fetch the object and read its body fully into memory.
client = boto3.client("s3")
response = client.get_object(Bucket="bucket", Key="reports/q4.xlsx")
payload = response["Body"].read()

# Parse the spreadsheet from memory and print its Markdown rendering.
with Document.from_bytes(payload, "xlsx") as workbook:
    markdown = workbook.to_markdown()
    print(markdown)

Rust(aws-sdk-s3)

use aws_sdk_s3::Client;
use std::io::Cursor;
use office_oxide::{Document, DocumentFormat};

// Build an S3 client from the configuration returned by `aws_config::load_from_env()`.
let client = Client::new(&aws_config::load_from_env().await);
// Request the object, then collect its body into a contiguous byte buffer.
let obj = client.get_object().bucket("bucket").key("reports/q4.xlsx").send().await?;
let bytes = obj.body.collect().await?.into_bytes();
// Wrap the bytes in a `Cursor` and parse them as XLSX.
let doc = Document::from_reader(Cursor::new(bytes.to_vec()), DocumentFormat::Xlsx)?;

来自 multipart 上传(Web 框架)

Python(FastAPI)

from fastapi import FastAPI, HTTPException, UploadFile
from office_oxide import Document

app = FastAPI()

# Format strings accepted by Document.from_bytes.
SUPPORTED_FORMATS = {"docx", "xlsx", "pptx", "doc", "xls", "ppt"}


@app.post("/extract")
async def extract(file: UploadFile):
    """Convert an uploaded Office document to Markdown.

    The format is taken from the uploaded file name's extension. Uploads with
    no file name, no extension, or an unsupported extension are rejected with
    HTTP 400 before the body is read.
    """
    # `filename` can be None when the client omits it, so never call string
    # methods on it directly.
    name = file.filename or ""
    # rpartition yields an empty separator when the name has no dot at all.
    _, dot, ext = name.rpartition(".")
    fmt = ext.lower()
    if not dot or fmt not in SUPPORTED_FORMATS:
        raise HTTPException(
            status_code=400,
            detail=f"unsupported or missing file extension: {name!r}",
        )

    data = await file.read()
    with Document.from_bytes(data, fmt) as doc:
        return {"markdown": doc.to_markdown()}

JavaScript(Hono / Express)

import { Hono } from 'hono';
import { Document } from 'office-oxide';

const app = new Hono();

app.post('/extract', async (c) => {
  const body = await c.req.parseBody();
  const file = body.file;        // File
  const data = new Uint8Array(await file.arrayBuffer());
  const fmt = file.name.split('.').pop().toLowerCase();
  using doc = Document.fromBytes(data, fmt);
  return c.json({ markdown: doc.toMarkdown() });
});

来自数据库 BLOB

Python(SQLAlchemy)

from sqlalchemy import create_engine, text
from office_oxide import Document

# Maps the stored MIME type to the format string passed to Document.from_bytes.
MIME_TO_FORMAT = {
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
}

engine = create_engine("postgresql://...")
query = text("SELECT data, mime FROM uploads WHERE id = :id")

# Fetch exactly one row holding the blob and its MIME type.
with engine.begin() as conn:
    row = conn.execute(query, {"id": 42}).one()

# An unknown MIME type raises KeyError here.
fmt = MIME_TO_FORMAT[row.mime]
with Document.from_bytes(row.data, fmt) as doc:
    print(doc.plain_text())

写回字节(用于上传)

编辑之后,可以把结果写入内存缓冲区,并直接流式发送给客户端。

Python

from office_oxide import EditableDocument
# NOTE(review): BytesIO is imported but not used in this snippet.
from io import BytesIO

# Open the template, substitute the placeholder, and serialize the edited
# document into memory instead of writing it to disk.
with EditableDocument.open("template.docx") as ed:
    ed.replace_text("{{name}}", "Alice")
    bytes_out = ed.save_to_bytes()

# Upload to S3, return as an HTTP response, etc.

JavaScript

// Open the template and substitute the placeholder.
using ed = EditableDocument.open('template.docx');
ed.replaceText('{{name}}', 'Alice');
// Serialize the edited document into memory and return it as the response body.
const bytes = ed.saveToBytes();   // Uint8Array
return new Response(bytes, {
  headers: { 'Content-Type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' },
});

Rust

use office_oxide::edit::EditableDocument;

// Open the template and substitute the placeholder.
let mut ed = EditableDocument::open("template.docx")?;
ed.replace_text("{{name}}", "Alice");

// Write the edited document into an in-memory cursor, then take ownership of
// the underlying byte vector.
let mut buf = std::io::Cursor::new(Vec::new());
ed.write_to(&mut buf)?;
let bytes: Vec<u8> = buf.into_inner();

选择 format 字符串

from_bytes 需要你告诉它格式。可接受的字符串严格如下:

"docx" | "xlsx" | "pptx" | "doc" | "xls" | "ppt"

来源格式未知时,先用 detect_format 检测:

# Import Document as well: the snippet calls Document.from_bytes below.
from office_oxide import Document, detect_format

fmt = detect_format("payload.bin")  # → "docx" | None
# Only parse when a format was detected; `data` holds the payload bytes.
if fmt:
    with Document.from_bytes(data, fmt) as doc:
        ...

检测器读取魔术字节(OOXML 为 ZIP 签名,旧版格式为 CFB 签名 D0 CF 11 E0),并快速检查 part 列表以区分 .docx / .xlsx / .pptx。

相关链接