analise_docs.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. import os
  2. import re
  3. import pandas as pd
  4. import textstat
  5. def process_inline_code(code):
  6. # Replace dot access with whitespace if detected
  7. if re.search(r"\w+\.\w+", code):
  8. code = code.replace(".", " ")
  9. # Replace square brackets with whitespace
  10. if "[" in code or "]" in code:
  11. code = code.replace("[", " ").replace("]", " ")
  12. return code
  13. def clean_markdown(text):
  14. # Remove code blocks (triple backticks)
  15. text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)
  16. # Process inline code: Replace inline code blocks with their processed plaintext
  17. def inline_code_replacer(match):
  18. code_content = match.group(1)
  19. processed = process_inline_code(code_content)
  20. return processed # Inline code is replaced with its processed content
  21. text = re.sub(r"`([^`]+)`", inline_code_replacer, text)
  22. # Replace markdown links with their placeholder value (the text inside the square brackets)
  23. text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
  24. # Optionally, remove or simplify other markdown formatting:
  25. # Remove headers (leading '#' characters) and emphasis markers
  26. text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)
  27. text = re.sub(r"[*_~]", "", text)
  28. return text
  29. def compute_readability(file_path):
  30. with open(file_path, encoding="utf-8") as f:
  31. content = f.read()
  32. # Clean markdown formatting for better readability analysis
  33. plain_text = clean_markdown(content)
  34. scores = {
  35. "File": file_path,
  36. "Flesch-Kincaid": textstat.flesch_kincaid_grade(plain_text),
  37. "SMOG Index": textstat.smog_index(plain_text),
  38. "ARI Index": textstat.automated_readability_index(plain_text),
  39. "Coleman-Liau": textstat.coleman_liau_index(plain_text),
  40. }
  41. return scores
  42. def analyze_docs(root_directory):
  43. results = []
  44. for root, _, files in os.walk(root_directory):
  45. for file in files:
  46. if file.endswith(".mdx") or file.endswith(".md"):
  47. file_path = os.path.join(root, file)
  48. file_path = file_path.replace("\\", "/")
  49. try:
  50. scores = compute_readability(file_path)
  51. results.append(scores)
  52. except Exception as e:
  53. print(f"Error processing {file_path}: {e}")
  54. return results
  55. # Replace 'docs_directory' with the path to your documentation
  56. docs_directory = r"."
  57. readability_results = analyze_docs(docs_directory)
  58. # Convert results to a DataFrame for a clean report
  59. df = pd.DataFrame(readability_results)
  60. print(df)
  61. # Define the threshold
  62. threshold = 15
  63. # Filter the DataFrame where any of the readability scores is above the threshold
  64. filtered_df = df[
  65. (df["Flesch-Kincaid"] > threshold)
  66. | (df["SMOG Index"] > threshold)
  67. | (df["ARI Index"] > threshold)
  68. | (df["Coleman-Liau"] > threshold)
  69. ]
  70. # Extract the list of file names
  71. pages_above_threshold = filtered_df["File"].tolist()
  72. print("Pages with at least one readability score above", threshold, ":", pages_above_threshold)
  73. # Optionally, save the report to CSV for further analysis
  74. df.to_csv("readability_report.csv", index=False)
  75. """Explainer of results:
  76. Each test shows how many years of education a person needs to be able to effectively read through the text.
  77. Flesch-Kincaid and SMOG Index are popular linguistic benchmarks applicable for most texts
  78. ARI Index and Coleman-Liau are recommended for technical texts.
  79. Generally is it considered that the lower the number is the easier the text is to approach.
  80. When analysing the results just look for outliers.
  81. """