macOS 如何自动翻译一本书

2024-07-16

Development

假设书的原始内容来自 pdf，首先通过 pdftotext 获取 pdf 的文本内容，储存为 book.txt:

1	pdftotext book.pdf txt/book.txt

使用 python 配合 DeepLX，储存翻译后的结果为 translated_book.txt:

1	python trans_book.py --i txt/book.txt --o txt/translated_book.txt

这是 trans_book.py 的内容，其中 https.txt 是可用 DeepLX 服务的地址列表（每行一个，以 https:// 开头）:

import requests
import os
import random
import argparse
from datetime import datetime
import time

# URL of a translation service running on this machine.
LOCAL_API_URL = "http://localhost:1188/translate"
# Sent as a Bearer token in the Authorization header of every request.
API_TOKEN = "your_access_token"
# Directory containing this script.
# NOTE(review): appears unused in the visible code — confirm before removing.
current_path = os.path.dirname(os.path.realpath(__file__))
# Text file listing remote servers, one URL per line. The path is relative,
# so it is resolved against the current working directory.
SERVERS_FILE = "https.txt"
# When True, the servers listed in SERVERS_FILE are not loaded.
USE_LOCAL_ONLY = False
# Number of passes over the server list before a translation is given up.
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds to wait between passes
TIMEOUT = 30  # seconds allowed for each HTTP request

def load_servers():
    """Read the server list from ``SERVERS_FILE``.

    Returns:
        A list with one entry per line of the file whose stripped text
        starts with ``https://``; every other line is ignored.
    """
    servers = []
    with open(SERVERS_FILE, 'r') as handle:
        for raw_line in handle:
            candidate = raw_line.strip()
            if candidate.startswith('https://'):
                servers.append(candidate)
    return servers

def save_servers(servers):
    """Overwrite ``SERVERS_FILE`` with ``servers``, one entry per line."""
    with open(SERVERS_FILE, 'w') as handle:
        handle.writelines(f"{entry}\n" for entry in servers)

def _translate_endpoint(server):
    """Return the ``/translate`` URL for ``server`` without duplicating the path."""
    base = server.rstrip('/')
    return base if base.endswith('/translate') else f"{base}/translate"


def translate_text(text, source_lang, target_lang="ZH"):
    """Translate ``text`` using the first server that returns a result.

    The candidate servers are the local service plus, unless
    ``USE_LOCAL_ONLY`` is set, the entries returned by ``load_servers()``.
    They are tried in random order; the whole list is retried up to
    ``MAX_RETRIES`` times with ``RETRY_DELAY`` seconds between passes.

    Args:
        text: Text to translate.
        source_lang: Source language code sent to the server.
        target_lang: Target language code sent to the server.

    Returns:
        ``(result, server)`` where ``result`` is the decoded JSON response
        (a dict with a non-empty ``"data"`` entry) and ``server`` is the
        server that produced it, or ``(None, None)`` if every attempt failed.
    """
    servers = []
    # The local service is configured with a plain http:// URL, so both
    # schemes must be accepted or it would never be tried.
    if LOCAL_API_URL.startswith(('http://', 'https://')):
        servers.append(LOCAL_API_URL)
    if not USE_LOCAL_ONLY:
        servers += load_servers()
    random.shuffle(servers)

    if not servers:
        print("No servers available. Aborting.")
        print("Max retries reached or no servers available. Translation failed.")
        return None, None

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_TOKEN}"
    }
    data = {
        "text": text,
        "source_lang": source_lang,
        "target_lang": target_lang
    }

    for attempt in range(1, MAX_RETRIES + 1):
        for server in servers:
            try:
                response = requests.post(
                    _translate_endpoint(server),
                    json=data,
                    headers=headers,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                result = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON on requests
                # versions whose decode error is not a RequestException.
                print(f"Failed to connect to {server}: {e}")
                continue

            # Guard against JSON payloads that are not objects (null, list, ...).
            if isinstance(result, dict) and result.get("data"):
                print(f"Successfully translated using {server}")
                return result, server
            print(f"Server {server} returned empty result. Trying next server.")

        # Only announce and wait when another pass will actually follow.
        if attempt < MAX_RETRIES:
            print(f"All servers failed. Retrying in {RETRY_DELAY} seconds... (Attempt {attempt}/{MAX_RETRIES})")
            time.sleep(RETRY_DELAY)

    print("Max retries reached or no servers available. Translation failed.")
    return None, None

def remove_unavailable_server(unavailable_server):
    """Drop ``unavailable_server`` from the server list stored in ``SERVERS_FILE``.

    Does nothing when the server is not in the loaded list. Only the first
    matching entry is removed before the list is written back.
    """
    known_servers = load_servers()
    if unavailable_server not in known_servers:
        return

    known_servers.remove(unavailable_server)
    save_servers(known_servers)
    print(f"Removed unavailable server {unavailable_server} from {SERVERS_FILE}")

def save_translation(original, translated, server, output_file):
    """Append one source/translation pair to ``output_file``.

    Each text is written on its own line wrapped in ``<p>...</p>``, the
    original first and the translation second.

    Args:
        original: Source text of the paragraph.
        translated: Translated text of the same paragraph.
        server: Server that produced the translation. Kept for interface
            compatibility; it is not written to the file.
        output_file: Path of the file to append to (UTF-8).
    """
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(f"<p>{original}</p>\n")
        f.write(f"<p>{translated}</p>\n")

def split_into_paragraphs(text, max_length=1000):
    """Group the lines of ``text`` into chunks of roughly ``max_length`` characters.

    Lines are never split: a chunk is closed as soon as adding the next line
    would push it past ``max_length``, so a single line longer than
    ``max_length`` becomes a chunk of its own. Each chunk is stripped of
    surrounding whitespace, and chunks that end up empty are dropped so that
    blank input does not produce empty translation requests.

    Args:
        text: The full text to split.
        max_length: Soft upper bound on the chunk size, in characters.

    Returns:
        A list of non-empty chunk strings in their original order.
    """
    paragraphs = []
    pending = []      # lines collected for the chunk being built
    pending_size = 0  # length of those lines, counting one newline per line

    for line in text.split('\n'):
        if pending and pending_size + len(line) > max_length:
            chunk = '\n'.join(pending).strip()
            if chunk:
                paragraphs.append(chunk)
            pending = []
            pending_size = 0
        pending.append(line)
        pending_size += len(line) + 1

    chunk = '\n'.join(pending).strip()
    if chunk:
        paragraphs.append(chunk)

    return paragraphs

def main(source_lang, input_file, output_file):
    """Translate ``input_file`` chunk by chunk and append the results to ``output_file``.

    The input is read as UTF-8, split with ``split_into_paragraphs`` and each
    chunk is passed to ``translate_text``. Successful translations are stored
    with ``save_translation``; chunks that could not be translated are
    reported and skipped.

    Args:
        source_lang: Source language code forwarded to ``translate_text``.
        input_file: Path of the text file to translate.
        output_file: Path of the file that receives the results.
    """
    print(f"Translating from {source_lang} to Chinese.")
    print(f"Input file: {input_file}")
    print(f"Output file: {output_file}")
    print(f"Using local service only: {'Yes' if USE_LOCAL_ONLY else 'No'}")
    print()

    with open(input_file, 'r', encoding='utf-8') as source:
        book_text = source.read()

    chunks = split_into_paragraphs(book_text)
    chunk_count = len(chunks)

    for number, chunk in enumerate(chunks, start=1):
        print(f"Translating paragraph {number}/{chunk_count}")

        result, server = translate_text(chunk, source_lang)
        if not result or "data" not in result:
            print(f"Translation failed for paragraph {number}. Skipping...")
            continue

        save_translation(chunk, result["data"], server, output_file)
        print(f"Paragraph {number} translated successfully using {server}")

    print("Translation completed. Results saved to", output_file)

if __name__ == "__main__":
    # Command-line entry point: parse the options and run the translation.
    cli = argparse.ArgumentParser(description="Book Translator")
    cli.add_argument(
        "--lang",
        default="AUTO",
        help="Source language (default: AUTO)",
    )
    cli.add_argument(
        "--i",
        default="txt/book.txt",
        help="Input file path",
    )
    cli.add_argument(
        "--o",
        default="txt/translated_book.txt",
        help="Output file path",
    )
    options = cli.parse_args()
    main(options.lang, options.i, options.o)

然后使用 pandoc 转换翻译结果为 book.html:

1	pandoc txt/translated_book.txt -o web/book.html --css=tokyo.css -t html --template=template.html --metadata title="Book Title"

html 的默认样式，tokyo.css 类似如下，灵感来自 nvim 的 tokyonight 主题:

/* TokyoNight Theme for HTML with Reading Enhancements */

/* Page defaults: dark background, light text, monospace font. */
body {
  background-color: #282c34;
  color: #abb2bf;
  font-family: "Fira Code", monospace;
  line-height: 1.6; /* Increased line height for better readability */
  text-align: justify; /* Justify text for even spacing */
  padding: 12px 24px;
}

/* Links, with a lighter shade on hover. */
a {
  color: #81a1c1;
}

a:hover {
  color: #88c0d0;
}

/* All heading levels share one accent color. */
h1,
h2,
h3,
h4,
h5,
h6 {
  color: #e06c75;
}

/* Inline code and preformatted blocks sit on a slightly lighter background. */
code,
pre {
  background-color: #373b41;
  color: #abb2bf;
  padding: 0.2em; /* Add padding for better code readability */
}

/* Token classes for syntax-highlighted code. All of these selectors have the
   same specificity, so for an element carrying several of the classes the
   rule that appears last wins. */
.comment {
  color: #5c6370;
}

.string {
  color: #98c379;
}

.keyword {
  color: #c678dd;
}

.number {
  color: #d19a66;
}

.function {
  color: #61aeee;
}

.boolean {
  color: #c678dd;
}

.operator {
  color: #abb2bf;
}

.punctuation {
  color: #abb2bf;
}

/* Highlight lines */
.highlight {
  background-color: #44475a;
}

template.html 模板类似如下:

<!doctype html>
<html lang="zh-CN">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta name="robots" content="noindex,nofollow" />
    <title>$title$</title>
$-- One <link> per stylesheet: css may hold several values, or none at all.
$for(css)$
    <link rel="stylesheet" href="$css$" />
$endfor$
  </head>
  <body>
$-- Kept at column 0 so pandoc does not indent the body (this would alter <pre> content).
$body$
  </body>
</html>

最后还可以使用 pandoc 转换 html 为 epub 在手机上阅读:

1	pandoc web/book.html -o epub/book.epub --metadata-file=metadata.yml --css=book.css

metadata.yml:

---
# Metadata handed to pandoc through --metadata-file when building the EPUB.
title: "Book Title"
author: "The Author"
# Cover image file; presumably resolved relative to the directory pandoc
# is run from — verify.
cover-image: "book_cover.png"
# Stylesheet for the EPUB; the pandoc command also passes --css=book.css.
css: "book.css"
---

book.css:

/* Base typography for the EPUB body text. */
body {
  /* Family names form a comma-separated list. Quoting the whole list would
     turn it into a single, nonexistent family name and the declaration
     would have no effect. */
  font-family: Georgia, serif; /* default font */
  line-height: 1.6; /* line height */
  /* background-color: #d5d6db; */
  /* color: #2e3338; */
}

p {
  /* CSS comments cannot be nested, so each disabled declaration and its
     note live in a single comment. */
  /* margin-bottom: 1em; (adds some spacing between paragraphs) */
  /* color: #2e3338; */
}

/* Headings use a sans-serif face and a dark text color. */
h1,
h2,
h3,
h4,
h5,
h6 {
  font-family: Arial, sans-serif; /* heading font */
  color: #2e3338;
}