1
Files
2025/tools/stats/common.py
2025-11-28 01:47:10 +01:00

167 lines
4.3 KiB
Python

#!/usr/bin/env python3
import os
import re
import json
import yaml
from datetime import datetime, date, timezone
# Three-letter month labels in calendar order (index 0 = January).
# NOTE(review): the spellings look like unaccented French abbreviations
# (Fev, Avr, Mai, Aou) - confirm before localising.
MONTH_LABELS = [
    "Jan", "Fev", "Mar", "Avr",
    "Mai", "Jun", "Jul", "Aou",
    "Sep", "Oct", "Nov", "Dec",
]
def find_markdown_files(root):
    """Return the paths of all Markdown files found under ``root``.

    The tree is walked recursively with ``os.walk``. A file is kept when its
    name ends with ``.md`` (compared case-insensitively) and is not exactly
    ``_index.md``. Each returned path is the walk directory joined with the
    file name.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root)
        for name in names
        if name.lower().endswith(".md") and name != "_index.md"
    ]
def collect_section_dirs(root):
    """Return the set of directories under ``root`` that contain ``_index.md``.

    Every directory visited by ``os.walk(root)`` (including ``root`` itself)
    whose file list includes ``_index.md`` is reported as an absolute path.
    """
    return {
        os.path.abspath(folder)
        for folder, _subdirs, names in os.walk(root)
        if "_index.md" in names
    }
def leaf_sections(section_dirs):
    """Return the subset of ``section_dirs`` that have no nested section.

    A directory is dropped when another entry of ``section_dirs`` starts with
    that directory followed by the path separator, i.e. lies somewhere below
    it. The separator is part of the prefix so that a sibling such as
    ``blog-archive`` does not count as a child of ``blog``.
    """

    def has_descendant(candidate):
        prefix = candidate + os.sep
        return any(
            other != candidate and other.startswith(prefix)
            for other in section_dirs
        )

    return {section for section in section_dirs if not has_descendant(section)}
def parse_frontmatter(path):
    """Read a UTF-8 text file and split it into front matter and body.

    The file is considered to have front matter when it starts with ``---``
    and contains a second ``---`` delimiter. The text between the two
    delimiters is parsed with ``yaml.safe_load``; everything after the second
    delimiter is the body.

    Args:
        path: Path of the file to read.

    Returns:
        A ``(data, body)`` tuple. ``data`` is always a ``dict``: it is empty
        when there is no front matter, when the YAML cannot be parsed, or
        when the YAML document is not a mapping (for example a bare scalar or
        a list). ``body`` is the text after the closing delimiter, or the
        whole file content when no front matter was detected.
    """
    with open(path, "r", encoding="utf-8") as handle:
        content = handle.read()

    if not content.startswith("---"):
        return {}, content

    # NOTE: the split is on the first "---" after the opening one, wherever
    # it occurs, so a "---" inside a front-matter value ends the block early.
    parts = content.split("---", 2)
    if len(parts) < 3:
        # Opening delimiter without a closing one: treat as plain content.
        return {}, content
    fm_text, body = parts[1], parts[2]

    try:
        data = yaml.safe_load(fm_text)
    except Exception:
        # Deliberately broad: besides YAML syntax errors, value constructors
        # (e.g. an impossible timestamp) can raise other exception types, and
        # a bad header must not abort the whole scan.
        data = None

    # Callers use ``data.get(...)``; a scalar or list document would break
    # them, so anything that is not a mapping is treated as "no metadata".
    if not isinstance(data, dict):
        data = {}
    return data, body
def parse_date(value):
    """Convert a date-like value into a naive ``datetime``.

    Supported inputs:
      * ``datetime`` - used as is.
      * ``date`` - combined with midnight.
      * ``int`` / ``float`` - interpreted as a POSIX timestamp in UTC.
      * ``str`` - tried against a fixed list of ``strptime`` formats, then
        ``datetime.fromisoformat``; surrounding whitespace is ignored.

    Timezone-aware results are converted to UTC and returned without
    ``tzinfo``; naive results are returned unchanged.

    Args:
        value: The raw value to interpret.

    Returns:
        A naive ``datetime``, or ``None`` when ``value`` is falsy, a ``bool``,
        of an unsupported type, or cannot be parsed.
    """
    # bool is a subclass of int; without this guard ``True`` would be read as
    # the timestamp 1.
    if not value or isinstance(value, bool):
        return None

    dt = None
    if isinstance(value, datetime):
        # Checked before ``date`` because datetime is a subclass of date.
        dt = value
    elif isinstance(value, date):
        dt = datetime.combine(value, datetime.min.time())
    elif isinstance(value, (int, float)):
        try:
            # Build an aware UTC value so the result does not depend on the
            # machine's local timezone; it is made naive below like every
            # other aware value.
            dt = datetime.fromtimestamp(value, tz=timezone.utc)
        except (ValueError, OverflowError, OSError):
            dt = None
    elif isinstance(value, str):
        text = value.strip()
        # Try ISO-like formats first, then day-first slashes.
        for fmt in (
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%d",
            "%Y/%m/%d",
            "%d/%m/%Y",
        ):
            try:
                dt = datetime.strptime(text, fmt)
                break
            except ValueError:
                continue
        if dt is None:
            try:
                dt = datetime.fromisoformat(text)
            except ValueError:
                dt = None

    if dt is None:
        return None
    if dt.tzinfo is not None:
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
    return dt
# A "word" is any run of word characters, apostrophes and hyphens.
WORD_RE = re.compile(r"[\w'-]+", re.UNICODE)


def count_words(text):
    """Return how many ``WORD_RE`` matches ``text`` contains.

    Falsy input (``None`` or an empty string) counts as zero words.
    """
    return len(WORD_RE.findall(text)) if text else 0
def resolve_section(file_path, content_root, leaf_dirs):
    """Return the leaf section that contains ``file_path``.

    Starting from the file's directory, ancestors are visited one by one
    until one of them is found in ``leaf_dirs`` or the walk leaves
    ``content_root``.

    Args:
        file_path: Path of the file to classify.
        content_root: Root directory of the content tree.
        leaf_dirs: Collection of absolute directory paths to match against.

    Returns:
        The matching directory relative to ``content_root``, using ``/`` as
        separator (``"."`` when the match is ``content_root`` itself), or
        ``None`` when no ancestor inside ``content_root`` is in ``leaf_dirs``.
    """
    content_root = os.path.abspath(content_root)
    # Compare on a path-component boundary: a plain string prefix test would
    # treat a sibling such as "content-old" as being inside "content".
    if content_root.endswith(os.sep):
        root_prefix = content_root
    else:
        root_prefix = content_root + os.sep

    current = os.path.abspath(os.path.dirname(file_path))
    while current == content_root or current.startswith(root_prefix):
        if current in leaf_dirs:
            return os.path.relpath(current, content_root).replace(os.sep, "/")
        parent = os.path.dirname(current)
        if parent == current:
            # Reached the filesystem root; nothing further to climb.
            break
        current = parent
    return None
def load_articles(content_root):
    """Collect one record per Markdown article found under ``content_root``.

    Files are discovered with ``find_markdown_files``; each one is parsed
    with ``parse_frontmatter`` and assigned to a section with
    ``resolve_section`` against the leaf section directories of the tree.

    Args:
        content_root: Root directory of the content tree.

    Returns:
        A list of dicts with the keys ``path``, ``relativePath`` (relative to
        ``content_root``), ``title`` (front-matter title, else the file name
        without extension), ``date`` (result of ``parse_date`` on the
        front-matter ``date``), ``wordCount`` (word count of the body),
        ``section`` and ``weather`` (raw front-matter value or ``None``).
    """
    files = find_markdown_files(content_root)
    leaf_dirs = leaf_sections(collect_section_dirs(content_root))

    articles = []
    for file_path in files:
        fm, body = parse_frontmatter(file_path)
        # Normalise once so every lookup below is safe; previously only the
        # "weather" lookup was guarded against non-mapping front matter.
        if not isinstance(fm, dict):
            fm = {}

        fallback_title = os.path.splitext(os.path.basename(file_path))[0]
        articles.append(
            {
                "path": file_path,
                "relativePath": os.path.relpath(file_path, content_root),
                "title": fm.get("title") or fallback_title,
                # Local kept out of the name ``date`` to avoid shadowing the
                # ``datetime.date`` class imported at module level.
                "date": parse_date(fm.get("date")),
                "wordCount": count_words(body),
                "section": resolve_section(file_path, content_root, leaf_dirs),
                "weather": fm.get("weather"),
            }
        )
    return articles
def write_result(data):
    """Serialise ``data`` as JSON to standard output and flush the stream."""
    import sys

    stream = sys.stdout
    json.dump(data, stream)
    stream.flush()