# main.py (6.5 KB)
  1. import io
  2. import os
  3. import subprocess
  4. import time
  5. from typing import Optional
  6. from fastapi import FastAPI, File, HTTPException, UploadFile
  7. from fastapi.responses import FileResponse, RedirectResponse
  8. from PIL import Image # Pillow library for image processing
  9. from pydantic import BaseModel
  10. # Response models
  11. class ConvertResponse(BaseModel):
  12. pdf_path: str
  13. converted: bool
  14. original: Optional[str] = None
  15. file_size: Optional[int] = None
  16. error: Optional[str] = None
  17. class HealthResponse(BaseModel):
  18. status: str
  19. service: str
  20. version: str
  21. uptime: float
  22. # FastAPI Application
  23. app = FastAPI(
  24. title="LibreOffice Document Conversion Service",
  25. description="Convert Word/PPT/Excel/PDF to PDF and support mixed content document processing",
  26. version="1.0.0",
  27. docs_url="/docs",
  28. redoc_url="/redoc"
  29. )
  30. start_time = time.time()
  31. @app.get("/", include_in_schema=False)
  32. async def root():
  33. """Redirect to documentation page"""
  34. return RedirectResponse(url="/docs")
  35. @app.get("/health", response_model=HealthResponse)
  36. async def health():
  37. """Health check interface"""
  38. return HealthResponse(
  39. status="healthy",
  40. service="libreoffice-converter",
  41. version="1.0.0",
  42. uptime=time.time() - start_time
  43. )
  44. @app.post("/convert")
  45. async def convert(file: UploadFile = File(...)):
  46. """
  47. Document conversion interface
  48. Returns: PDF file stream
  49. """
  50. try:
  51. # File format validation
  52. allowed_extensions = [
  53. '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx',
  54. '.md', '.txt', '.rtf', '.odt', '.ods', '.odp',
  55. '.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'
  56. ]
  57. file_ext = os.path.splitext(file.filename)[1].lower()
  58. if file_ext not in allowed_extensions:
  59. raise HTTPException(
  60. status_code=400,
  61. detail=f"Unsupported file format: {file_ext}. Supported formats: {', '.join(allowed_extensions)}"
  62. )
  63. # Check uploads directory existence
  64. upload_dir = "/app/uploads" if os.path.exists("/app/uploads") else "./uploads"
  65. os.makedirs(upload_dir, exist_ok=True)
  66. # Save uploaded file
  67. filepath = os.path.join(upload_dir, file.filename)
  68. with open(filepath, "wb") as buffer:
  69. content = await file.read()
  70. buffer.write(content)
  71. # For PDF files, return directly without conversion
  72. if file_ext == '.pdf':
  73. return FileResponse(filepath, filename=file.filename, media_type='application/pdf')
  74. if file_ext == '.md':
  75. # Use Node.js script to render Markdown to PDF
  76. expected_pdf = filepath.rsplit('.', 1)[0] + '.pdf'
  77. cmd = [
  78. 'node',
  79. '/app/md_to_pdf.js',
  80. filepath,
  81. expected_pdf
  82. ]
  83. elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp']:
  84. # For image files, use Pillow to convert to PDF
  85. expected_pdf = filepath.rsplit('.', 1)[0] + '.pdf'
  86. # Open image and save as PDF
  87. with Image.open(filepath) as img:
  88. # Convert RGBA mode to RGB (support for transparent images)
  89. if img.mode in ('RGBA', 'LA', 'P'):
  90. # Convert to white background
  91. background = Image.new('RGB', img.size, (255, 255, 255))
  92. if img.mode == 'P':
  93. img = img.convert('RGBA')
  94. background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
  95. img = background
  96. elif img.mode != 'RGB':
  97. img = img.convert('RGB')
  98. # Save as PDF
  99. img.save(expected_pdf, 'PDF', resolution=100.0, save_all=False)
  100. # Verify PDF generation completed
  101. if not os.path.exists(expected_pdf):
  102. raise HTTPException(
  103. status_code=500,
  104. detail="Image to PDF conversion succeeded but output file not found"
  105. )
  106. # Image conversion completed, return PDF file
  107. filename_base = os.path.splitext(file.filename)[0]
  108. return FileResponse(expected_pdf, filename=f"{filename_base}.pdf", media_type='application/pdf')
  109. else:
  110. # Conversion using LibreOffice
  111. cmd = [
  112. 'soffice',
  113. '--headless',
  114. '--convert-to', 'pdf',
  115. '--outdir', upload_dir,
  116. filepath
  117. ]
  118. result = subprocess.run(
  119. cmd,
  120. capture_output=True,
  121. text=True,
  122. timeout=600, # Extended to 10 minutes to support complex Markdown conversion
  123. )
  124. # Combine stdout and stderr for error reporting since capture_output uses PIPE
  125. combined_output = result.stdout if result.stdout else ""
  126. if result.stderr:
  127. combined_output += "\n" + result.stderr
  128. # Display Node.js script output for debugging
  129. print(f"Node.js script output: {combined_output}")
  130. if result.returncode != 0:
  131. print(f"Subprocess failed with return code: {result.returncode}")
  132. # Combine stdout and stderr for error reporting
  133. combined_output = result.stdout if result.stdout else ""
  134. if result.stderr:
  135. combined_output += "\n" + result.stderr
  136. print(f"Subprocess output: {combined_output}")
  137. raise HTTPException(
  138. status_code=500,
  139. detail=f"Conversion failed: {combined_output}"
  140. )
  141. # Verify output file
  142. expected_pdf = filepath.rsplit('.', 1)[0] + '.pdf'
  143. if not os.path.exists(expected_pdf):
  144. raise HTTPException(
  145. status_code=500,
  146. detail="Conversion succeeded but output file not found"
  147. )
  148. filename_base = os.path.splitext(file.filename)[0]
  149. return FileResponse(expected_pdf, filename=f"{filename_base}.pdf", media_type='application/pdf')
  150. except HTTPException:
  151. raise
  152. except subprocess.TimeoutExpired:
  153. raise HTTPException(status_code=504, detail="Conversion timeout (300 seconds)")
  154. except Exception as e:
  155. raise HTTPException(status_code=500, detail=str(e))
  156. @app.get("/version")
  157. async def version():
  158. """Version information"""
  159. return {
  160. "service": "libreoffice-converter",
  161. "version": "1.0.0",
  162. "framework": "FastAPI",
  163. "libreoffice": "7.x"
  164. }