fill_inspect.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
#!/usr/bin/env python3
"""
fill_inspect.py — Inspect form fields in an existing PDF.

Usage:
    python3 fill_inspect.py --input form.pdf
    python3 fill_inspect.py --input form.pdf --out fields.json

Outputs a JSON summary of every fillable field: name, type, current value,
allowed values (for checkboxes, radio groups, dropdowns and list boxes),
and page number when it can be determined.

Exit codes: 0 success, 1 bad args / file not found, 2 dep missing, 3 read error
"""
  11. import argparse
  12. import json
  13. import sys
  14. import importlib.util
  15. import os
  16. def ensure_deps():
  17. if importlib.util.find_spec("pypdf") is None:
  18. import subprocess
  19. subprocess.check_call(
  20. [sys.executable, "-m", "pip", "install", "--break-system-packages", "-q", "pypdf"]
  21. )
  22. ensure_deps()
  23. from pypdf import PdfReader
  24. from pypdf.generic import ArrayObject, DictionaryObject, NameObject, TextStringObject
  25. # ── Field type resolution ──────────────────────────────────────────────────────
  26. def _field_type(field) -> str:
  27. ft = field.get("/FT")
  28. if ft is None:
  29. return "unknown"
  30. ft = str(ft)
  31. if ft == "/Tx":
  32. return "text"
  33. if ft == "/Btn":
  34. ff = int(field.get("/Ff", 0))
  35. return "radio" if ff & (1 << 15) else "checkbox"
  36. if ft == "/Ch":
  37. ff = int(field.get("/Ff", 0))
  38. return "dropdown" if ff & (1 << 17) else "listbox"
  39. if ft == "/Sig":
  40. return "signature"
  41. return "unknown"
  42. def _field_value(field) -> str | None:
  43. v = field.get("/V")
  44. return str(v) if v is not None else None
  45. def _field_options(field, ftype: str) -> dict:
  46. extra = {}
  47. if ftype in ("checkbox",):
  48. ap = field.get("/AP")
  49. if ap and "/N" in ap:
  50. states = [str(k) for k in ap["/N"]]
  51. extra["states"] = states
  52. checked = next((s for s in states if s != "/Off"), None)
  53. if checked:
  54. extra["checked_value"] = checked
  55. if ftype in ("dropdown", "listbox"):
  56. opt = field.get("/Opt")
  57. if opt:
  58. choices = []
  59. for item in opt:
  60. if isinstance(item, (list, ArrayObject)) and len(item) >= 2:
  61. choices.append({"value": str(item[0]), "label": str(item[1])})
  62. else:
  63. choices.append({"value": str(item), "label": str(item)})
  64. extra["choices"] = choices
  65. if ftype == "radio":
  66. kids = field.get("/Kids")
  67. if kids:
  68. values = []
  69. for kid in kids:
  70. ap = kid.get("/AP")
  71. if ap and "/N" in ap:
  72. for k in ap["/N"]:
  73. if str(k) != "/Off":
  74. values.append(str(k))
  75. extra["radio_values"] = values
  76. return extra
  77. def _walk_fields(fields, page_map: dict, parent_name: str = "") -> list:
  78. """Recursively collect all leaf fields."""
  79. result = []
  80. for field in fields:
  81. name = str(field.get("/T", ""))
  82. full = f"{parent_name}.{name}" if parent_name else name
  83. kids = field.get("/Kids")
  84. # Kids that have /T are sub-fields (groups), not widget annotations
  85. if kids:
  86. named_kids = [k for k in kids if "/T" in k]
  87. if named_kids:
  88. result.extend(_walk_fields(named_kids, page_map, full))
  89. continue
  90. ftype = _field_type(field)
  91. if ftype == "unknown":
  92. continue
  93. entry = {
  94. "name": full,
  95. "type": ftype,
  96. "value": _field_value(field),
  97. }
  98. entry.update(_field_options(field, ftype))
  99. # Page lookup via /P indirect reference
  100. p_ref = field.get("/P")
  101. if p_ref and hasattr(p_ref, "idnum"):
  102. entry["page"] = page_map.get(p_ref.idnum, "?")
  103. result.append(entry)
  104. return result
  105. def inspect(pdf_path: str) -> dict:
  106. try:
  107. reader = PdfReader(pdf_path)
  108. except Exception as e:
  109. return {"status": "error", "error": str(e)}
  110. # Build page-number lookup: {object_id: 1-based page number}
  111. page_map = {}
  112. for i, page in enumerate(reader.pages):
  113. if hasattr(page, "indirect_reference") and page.indirect_reference:
  114. page_map[page.indirect_reference.idnum] = i + 1
  115. acroform = reader.trailer.get("/Root", {}).get("/AcroForm")
  116. if acroform is None or "/Fields" not in acroform:
  117. return {
  118. "status": "ok",
  119. "has_fields": False,
  120. "field_count": 0,
  121. "fields": [],
  122. "note": "This PDF has no fillable form fields.",
  123. }
  124. fields = _walk_fields(list(acroform["/Fields"]), page_map)
  125. return {
  126. "status": "ok",
  127. "has_fields": bool(fields),
  128. "field_count": len(fields),
  129. "fields": fields,
  130. }
  131. def main():
  132. parser = argparse.ArgumentParser(description="Inspect PDF form fields")
  133. parser.add_argument("--input", required=True, help="PDF file to inspect")
  134. parser.add_argument("--out", default="", help="Write JSON to file (optional)")
  135. args = parser.parse_args()
  136. if not os.path.exists(args.input):
  137. print(json.dumps({"status": "error", "error": f"File not found: {args.input}"}),
  138. file=sys.stderr)
  139. sys.exit(1)
  140. result = inspect(args.input)
  141. output = json.dumps(result, indent=2, ensure_ascii=False)
  142. if args.out:
  143. with open(args.out, "w") as f:
  144. f.write(output)
  145. print(output)
  146. # Human-readable summary
  147. if result["status"] == "ok" and result["has_fields"]:
  148. print(f"\n── Fields in {args.input} ──────────────────────────────",
  149. file=sys.stderr)
  150. for f in result["fields"]:
  151. pg = f" p.{f['page']}" if "page" in f else ""
  152. val = f" = {f['value']}" if f.get("value") else ""
  153. extra = ""
  154. if "choices" in f:
  155. extra = f" [{', '.join(c['value'] for c in f['choices'][:4])}{'…' if len(f['choices'])>4 else ''}]"
  156. elif "states" in f:
  157. extra = f" {f['states']}"
  158. print(f" {f['type']:12} {f['name']}{pg}{val}{extra}", file=sys.stderr)
  159. print("", file=sys.stderr)
  160. if __name__ == "__main__":
  161. main()