wxcz_admin
/
agency-swarm-cn-git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
							import os
import re

import pandas as pd
import textstat


def process_inline_code(code):
    """Flatten an inline code snippet into space-separated words.

    When the snippet contains dotted access (word characters on both sides
    of a dot), every dot in the snippet is replaced with a space. Square
    brackets are replaced with spaces as well. Other characters are kept.

    Args:
        code: The text found between a pair of single backticks.

    Returns:
        The snippet with dots (if dotted access was detected) and square
        brackets turned into spaces.
    """
    # A lone trailing dot such as "end." is not dotted access, so the
    # snippet is only rewritten when a dot sits between word characters.
    has_dotted_access = re.search(r"\w+\.\w+", code) is not None
    flattened = code.replace(".", " ") if has_dotted_access else code

    # Map both bracket characters to a space in one pass; a snippet
    # without brackets comes back unchanged.
    bracket_to_space = {ord("["): " ", ord("]"): " "}
    return flattened.translate(bracket_to_space)


def clean_markdown(text):
    """Strip Markdown syntax from *text* so that mostly prose remains.

    The steps run in a fixed order:

    1. Fenced code blocks (triple backticks) are dropped entirely.
    2. Inline code spans are replaced by the result of
       ``process_inline_code`` applied to their content.
    3. Links of the form ``[label](target)`` are reduced to ``label``.
    4. Leading ``#`` header markers are removed from every line.
    5. The emphasis characters ``*``, ``_`` and ``~`` are removed everywhere.

    Args:
        text: Raw Markdown source.

    Returns:
        The text with the formatting above removed.
    """
    # Fences must go before inline code: otherwise the single-backtick
    # pattern would start matching inside the triple-backtick delimiters.
    without_fences = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # Inline code keeps its words, only flattened into plain text.
    without_inline_code = re.sub(
        r"`([^`]+)`",
        lambda span: process_inline_code(span.group(1)),
        without_fences,
    )

    # Keep the visible label of each link and discard the target.
    links_as_labels = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", without_inline_code)

    # Drop header markers at the start of lines, then emphasis characters.
    without_headers = re.sub(r"^#+\s*", "", links_as_labels, flags=re.MULTILINE)
    return re.sub(r"[*_~]", "", without_headers)


def compute_readability(file_path):
    """Compute readability scores for one Markdown file.

    The file is read as UTF-8, passed through ``clean_markdown``, and the
    cleaned text is scored with four ``textstat`` metrics.

    Args:
        file_path: Path of the file to read; it is also stored in the result.

    Returns:
        A dict with the keys ``"File"``, ``"Flesch-Kincaid"``,
        ``"SMOG Index"``, ``"ARI Index"`` and ``"Coleman-Liau"``, in that
        order.
    """
    with open(file_path, encoding="utf-8") as handle:
        raw_markdown = handle.read()

    # Score the prose rather than the markup.
    plain_text = clean_markdown(raw_markdown)

    # Report label -> textstat function, in the order the columns appear.
    metrics = (
        ("Flesch-Kincaid", textstat.flesch_kincaid_grade),
        ("SMOG Index", textstat.smog_index),
        ("ARI Index", textstat.automated_readability_index),
        ("Coleman-Liau", textstat.coleman_liau_index),
    )

    scores = {"File": file_path}
    for label, metric in metrics:
        scores[label] = metric(plain_text)
    return scores


def analyze_docs(root_directory):
    """Walk *root_directory* and score every ``.md`` / ``.mdx`` file in it.

    Each matching file is passed to ``compute_readability``. A file that
    raises is reported on stdout and skipped, so one bad file does not abort
    the whole run.

    Args:
        root_directory: Directory to search recursively.

    Returns:
        A list with one score dict per file that was processed successfully.
    """
    collected = []
    for folder, _subdirs, filenames in os.walk(root_directory):
        for name in filenames:
            if not name.endswith((".mdx", ".md")):
                continue

            # Use forward slashes so reported paths look the same on Windows.
            doc_path = os.path.join(folder, name).replace("\\", "/")
            try:
                collected.append(compute_readability(doc_path))
            except Exception as err:
                print(f"Error processing {doc_path}: {err}")
    return collected


# Replace 'docs_directory' with the path to your documentation
docs_directory = r"."
readability_results = analyze_docs(docs_directory)

# Convert results to a DataFrame for a clean report.
# The columns are named explicitly so the frame still has them when no
# Markdown file was found (or every file failed to process). Without this,
# an empty result list gives a column-less frame and the score filter below
# raises KeyError.
report_columns = ["File", "Flesch-Kincaid", "SMOG Index", "ARI Index", "Coleman-Liau"]
df = pd.DataFrame(readability_results, columns=report_columns)
print(df)

# Define the threshold
threshold = 15

# Filter the DataFrame where any of the readability scores is above the threshold
filtered_df = df[
    (df["Flesch-Kincaid"] > threshold)
    | (df["SMOG Index"] > threshold)
    | (df["ARI Index"] > threshold)
    | (df["Coleman-Liau"] > threshold)
]

# Extract the list of file names
pages_above_threshold = filtered_df["File"].tolist()
print("Pages with at least one readability score above", threshold, ":", pages_above_threshold)


# Optionally, save the report to CSV for further analysis
df.to_csv("readability_report.csv", index=False)

"""Explainer of results:
Each test shows how many years of education a person needs to be able to read through the text effectively.
Flesch-Kincaid and SMOG Index are popular linguistic benchmarks applicable to most texts.
ARI Index and Coleman-Liau are recommended for technical texts.
Generally, it is considered that the lower the number is, the easier the text is to approach.
When analysing the results, just look for outliers.
"""