| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191 |
- import io
- import os
- import subprocess
- import time
- from typing import Optional
- from fastapi import FastAPI, File, HTTPException, UploadFile
- from fastapi.responses import FileResponse, RedirectResponse
- from PIL import Image # Pillow library for image processing
- from pydantic import BaseModel
- # Response models
# Result description for a conversion: where the PDF is, whether a conversion
# took place, and optional details about the source file or a failure.
# NOTE(review): no route in the visible part of this file references this
# model; /convert returns a FileResponse directly.
# (Kept as comments rather than a class docstring because pydantic copies a
# model's docstring into the generated schema description.)
class ConvertResponse(BaseModel):
    # Required fields.
    pdf_path: str
    converted: bool

    # Optional fields; each defaults to None when not supplied.
    original: Optional[str] = None
    file_size: Optional[int] = None  # presumably bytes -- TODO confirm
    error: Optional[str] = None
# Payload returned by GET /health (declared there as the response_model).
# (Kept as comments rather than a class docstring because pydantic copies a
# model's docstring into the generated schema description.)
class HealthResponse(BaseModel):
    status: str    # the /health handler always sends "healthy"
    service: str   # service identifier string
    version: str   # service version string
    uptime: float  # time.time() delta since module import, i.e. seconds
- # FastAPI Application
# Application instance. Interactive documentation is mounted at /docs and
# /redoc; the version given here is also exposed as ``app.version``.
app = FastAPI(
    title="LibreOffice Document Conversion Service",
    description="Convert Word/PPT/Excel/PDF to PDF and support mixed content document processing",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# Wall-clock timestamp captured once at import time; the /health handler
# subtracts it from the current time to report uptime.
start_time = time.time()
@app.get("/", include_in_schema=False)
async def root():
    """Redirect to documentation page"""
    # The route is excluded from the OpenAPI schema (include_in_schema=False);
    # it only forwards visitors of "/" to the interactive docs page.
    docs_redirect = RedirectResponse(url="/docs")
    return docs_redirect
@app.get("/health", response_model=HealthResponse)
async def health():
    """Health check interface"""
    # The docstring above is published by FastAPI as this operation's
    # description, so it is kept verbatim and maintainer notes live here.
    #
    # Returns a HealthResponse with a fixed "healthy" status, the service
    # name, the application version and the uptime in seconds.
    # NOTE: uptime is a difference of time.time() readings (wall clock), so
    # it can jump if the system clock is adjusted while the service runs.
    return HealthResponse(
        status="healthy",
        service="libreoffice-converter",
        # Read from the app instead of repeating the literal, so the value
        # cannot drift from the one passed to FastAPI(version=...).
        version=app.version,
        uptime=time.time() - start_time,
    )
# Extensions accepted by /convert, in the order quoted in the 400 error detail.
_ALLOWED_EXTENSIONS = (
    '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx',
    '.md', '.txt', '.rtf', '.odt', '.ods', '.odp',
    '.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp',
)
# Subset of the above that is rendered with Pillow instead of an external tool.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp')
# Upper bound for one external converter run; also quoted in the 504 detail
# so the message cannot disagree with the timeout that is actually applied.
_CONVERSION_TIMEOUT_SECONDS = 600
# Uploads are copied to disk in pieces of this size rather than read whole.
_UPLOAD_CHUNK_BYTES = 1024 * 1024


def _safe_upload_name(raw_name: Optional[str]) -> Optional[str]:
    """Return the last path component of a client-supplied name, or None if unusable."""
    if not raw_name:
        return None
    # Treat backslashes as separators too, so Windows-style paths are also
    # reduced to their final component on POSIX hosts.
    leaf = os.path.basename(raw_name.replace('\\', '/'))
    if leaf in ('', '.', '..') or '\x00' in leaf:
        return None
    return leaf


def _discard_stale_output(path: str) -> None:
    """Delete a leftover file at *path* so a later existence check is meaningful."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _image_to_pdf(source_path: str, pdf_path: str) -> None:
    """Render the image at *source_path* into a single-page PDF at *pdf_path*."""
    with Image.open(source_path) as opened:
        frame = opened
        if frame.mode in ('RGBA', 'LA', 'P'):
            # Flatten transparency onto a white canvas.
            canvas = Image.new('RGB', frame.size, (255, 255, 255))
            if frame.mode == 'P':
                frame = frame.convert('RGBA')
            # frame is RGBA or LA at this point; its last band is the alpha mask.
            canvas.paste(frame, mask=frame.split()[-1])
            frame = canvas
        elif frame.mode != 'RGB':
            frame = frame.convert('RGB')
        frame.save(pdf_path, 'PDF', resolution=100.0, save_all=False)


def _merge_process_output(result: subprocess.CompletedProcess) -> str:
    """Join captured stdout and stderr into one string for logs and error details."""
    merged = result.stdout if result.stdout else ""
    if result.stderr:
        merged += "\n" + result.stderr
    return merged


@app.post("/convert")
async def convert(file: UploadFile = File(...)):
    """
    Document conversion interface
    Returns: PDF file stream
    """
    # The docstring above is published by FastAPI as this operation's
    # description, so it is kept verbatim and maintainer notes live here.
    #
    # Flow: validate the extension, store the upload, then either return it
    # as-is (.pdf), render it with Pillow (images), run the Node.js Markdown
    # script (.md) or run headless LibreOffice (everything else).
    # Errors: 400 for a missing/unsupported file name, 504 when the external
    # converter exceeds the timeout, 500 for any other failure.
    #
    # NOTE(review): files are stored under the client's file name in one
    # shared directory and are not removed afterwards, so concurrent uploads
    # with the same name can overwrite each other.

    # Imported here rather than at module top; fastapi is already a dependency.
    from fastapi.concurrency import run_in_threadpool

    try:
        # Never trust the client-supplied name as a path: keep only its last
        # component so the upload cannot be written outside the upload dir.
        safe_name = _safe_upload_name(file.filename)
        if safe_name is None:
            raise HTTPException(
                status_code=400,
                detail="Uploaded file has no usable filename"
            )

        # File format validation
        filename_base, raw_ext = os.path.splitext(safe_name)
        file_ext = raw_ext.lower()
        if file_ext not in _ALLOWED_EXTENSIONS:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file format: {file_ext}. Supported formats: {', '.join(_ALLOWED_EXTENSIONS)}"
            )

        # Prefer the container path when present, else a local directory.
        upload_dir = "/app/uploads" if os.path.exists("/app/uploads") else "./uploads"
        os.makedirs(upload_dir, exist_ok=True)

        # Save the upload chunk by chunk instead of reading it whole.
        filepath = os.path.join(upload_dir, safe_name)
        with open(filepath, "wb") as buffer:
            while True:
                chunk = await file.read(_UPLOAD_CHUNK_BYTES)
                if not chunk:
                    break
                buffer.write(chunk)

        # For PDF files, return directly without conversion
        if file_ext == '.pdf':
            return FileResponse(filepath, filename=safe_name, media_type='application/pdf')

        expected_pdf = os.path.join(upload_dir, filename_base + '.pdf')
        # A PDF left behind by an earlier request would otherwise satisfy the
        # "output exists" check below even if this conversion wrote nothing.
        _discard_stale_output(expected_pdf)

        if file_ext in _IMAGE_EXTENSIONS:
            # Pillow work is blocking; keep it off the event loop.
            await run_in_threadpool(_image_to_pdf, filepath, expected_pdf)
            missing_detail = "Image to PDF conversion succeeded but output file not found"
        else:
            if file_ext == '.md':
                # Use Node.js script to render Markdown to PDF
                cmd = ['node', '/app/md_to_pdf.js', filepath, expected_pdf]
            else:
                # Conversion using LibreOffice
                cmd = [
                    'soffice',
                    '--headless',
                    '--convert-to', 'pdf',
                    '--outdir', upload_dir,
                    filepath,
                ]
            # Run the converter in a worker thread: a blocking call here would
            # stall every other request for up to the whole timeout.
            result = await run_in_threadpool(
                subprocess.run,
                cmd,
                capture_output=True,
                text=True,
                timeout=_CONVERSION_TIMEOUT_SECONDS,
            )
            combined_output = _merge_process_output(result)
            print(f"Converter output ({cmd[0]}): {combined_output}")
            if result.returncode != 0:
                print(f"Subprocess failed with return code: {result.returncode}")
                raise HTTPException(
                    status_code=500,
                    detail=f"Conversion failed: {combined_output}"
                )
            missing_detail = "Conversion succeeded but output file not found"

        # Verify output file
        if not os.path.exists(expected_pdf):
            raise HTTPException(status_code=500, detail=missing_detail)

        return FileResponse(expected_pdf, filename=f"{filename_base}.pdf", media_type='application/pdf')
    except HTTPException:
        raise
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(
            status_code=504,
            detail=f"Conversion timeout ({_CONVERSION_TIMEOUT_SECONDS} seconds)"
        ) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
@app.get("/version")
async def version():
    """Version information"""
    # The docstring above is published by FastAPI as this operation's
    # description, so it is kept verbatim and maintainer notes live here.
    #
    # Returns a plain dict describing the service, its version, the web
    # framework and the LibreOffice release line.
    return {
        "service": "libreoffice-converter",
        # Read from the app instead of repeating the literal, so the value
        # cannot drift from the one passed to FastAPI(version=...).
        "version": app.version,
        "framework": "FastAPI",
        # Static label; it is not probed from the installed soffice binary.
        "libreoffice": "7.x",
    }
|