download_real_medias.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. # -*- coding: utf-8 -*-
  2. """从 Wikimedia Commons / Pexels 下载真实照片到 seed_assets/medias。"""
  3. import json
  4. import time
  5. import urllib.parse
  6. import urllib.request
  7. from pathlib import Path
# Directory one level above the folder that contains this script
# (presumably the project root -- confirm against the repository layout).
ROOT = Path(__file__).resolve().parents[1]
# Destination directory for the downloaded photos.
OUT = ROOT / 'seed_assets' / 'medias'
# Created as an import-time side effect, including any missing parents.
OUT.mkdir(parents=True, exist_ok=True)
# User-Agent header value attached to the HTTP requests built in this script.
UA = 'DockScope/1.0 (educational demo)'
  12. # (本地文件名, Wikimedia 文件名 或 None, 备用直链 URL)
  13. ITEMS = [
  14. (
  15. '01_concrete_crack_bridge.jpg',
  16. 'Darmsheim_Brücke03_2010-06-29.jpg',
  17. None,
  18. ),
  19. (
  20. '02_bridge_concrete_cracks.jpg',
  21. 'Darmsheim_Brücke04_2010-06-29.jpg',
  22. None,
  23. ),
  24. (
  25. '03_steel_bridge_corrosion.jpg',
  26. 'Nandu_River_Iron_Bridge_corrosion_-_02.jpg',
  27. None,
  28. ),
  29. (
  30. '04_concrete_bending_cracks.jpg',
  31. 'PHOTO_B_EMC_CemPozz_Feb_13.jpg',
  32. None,
  33. ),
  34. (
  35. '05_bridge_substructure.jpg',
  36. 'I-35W_bridge_structure_before_collapse.jpg',
  37. None,
  38. ),
  39. (
  40. '06_shrinkage_cracks_concrete.jpg',
  41. 'Beton-Schwindrisse.png',
  42. None,
  43. ),
  44. (
  45. '07_asphalt_crocodile_cracking.jpg',
  46. 'Cracked_asphalt.jpg',
  47. None,
  48. ),
  49. (
  50. '08_concrete_rebar_corrosion.jpg',
  51. 'Concrete_bridge_surface_reinforcement_corrosion_due_to_chlorides.jpg',
  52. 'https://images.pexels.com/photos/2219024/pexels-photo-2219024.jpeg?auto=compress&cs=tinysrgb&w=1600',
  53. ),
  54. (
  55. '09_steel_beam_site.jpg',
  56. 'Steel_beams.jpg',
  57. None,
  58. ),
  59. (
  60. '10_rust_metal_texture.jpg',
  61. None,
  62. 'https://images.pexels.com/photos/1157255/pexels-photo-1157255.jpeg?auto=compress&cs=tinysrgb&w=1600',
  63. ),
  64. ]
  65. API = 'https://commons.wikimedia.org/w/api.php'
  66. def commons_url(file_name: str) -> str | None:
  67. params = urllib.parse.urlencode(
  68. {
  69. 'action': 'query',
  70. 'titles': f'File:{file_name}',
  71. 'prop': 'imageinfo',
  72. 'iiprop': 'url',
  73. 'format': 'json',
  74. },
  75. encoding='utf-8',
  76. )
  77. req = urllib.request.Request(f'{API}?{params}', headers={'User-Agent': UA})
  78. with urllib.request.urlopen(req, timeout=60) as resp:
  79. data = json.loads(resp.read().decode('utf-8'))
  80. for page in data.get('query', {}).get('pages', {}).values():
  81. if 'missing' in page:
  82. return None
  83. info = page.get('imageinfo') or []
  84. if info:
  85. return info[0].get('url')
  86. return None
  87. def download(url: str, dest: Path) -> bool:
  88. if dest.exists() and dest.stat().st_size > 20_000:
  89. print(f' skip {dest.name} ({dest.stat().st_size // 1024} KB)')
  90. return True
  91. req = urllib.request.Request(url, headers={'User-Agent': UA})
  92. with urllib.request.urlopen(req, timeout=180) as resp:
  93. dest.write_bytes(resp.read())
  94. print(f' ok {dest.name} ({dest.stat().st_size // 1024} KB)')
  95. return True
  96. def main():
  97. for local, wiki, fallback in ITEMS:
  98. print(local)
  99. url = None
  100. if wiki:
  101. try:
  102. url = commons_url(wiki)
  103. except Exception as e:
  104. print(f' api error: {e}')
  105. if not url:
  106. url = fallback
  107. if not url:
  108. print(' no url')
  109. continue
  110. try:
  111. download(url, OUT / local)
  112. except Exception as e:
  113. print(f' fail: {e}')
  114. time.sleep(2.5)
  115. print('saved to', OUT)
  116. if __name__ == '__main__':
  117. main()