Coverage for src/document_parser.py: 63%
175 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-08 05:40 +0000
1import re
2import os
3from pathlib import Path
4from typing import Dict, List, Optional, Tuple
5from dataclasses import dataclass
@dataclass
class Section:
    """A parsed document section; hierarchy is tracked via ID references."""

    id: str  # Hierarchical dotted ID, e.g. "parent.child" (built by DocumentParser._generate_section_id)
    title: str  # Header text with the leading =/# markers stripped
    level: int  # Header depth: the number of '=' or '#' characters in the marker
    content: str  # Body text between this header and the next header (stripped)
    line_start: int  # Index of the header line in the resolved (include-expanded) document
    line_end: int  # Index of the section's last content line in the resolved document
    source_file: str  # Relative path to source file
    children: List[str]  # Store child IDs instead of Section objects
    parent_id: Optional[str] = None  # Store parent ID instead of Section object
    document_position: int = 0  # Position in resolved document for proper sorting
class DocumentParser:
    """Parse AsciiDoc/Markdown documents into hierarchical :class:`Section` objects.

    ``include::file.adoc[]`` directives are resolved recursively up to
    ``max_include_depth`` levels; circular references are detected via the set
    of already-processed file paths.
    """

    def __init__(self, max_include_depth: int = 4):
        self.max_include_depth = max_include_depth
        self.processed_files = set()
        # Project root used by _parse_structure() to emit relative source
        # paths. Previously this attribute was never initialized, so the
        # relative-path branch always failed with a (silently swallowed)
        # AttributeError. It is set by parse_project(); None until then.
        self.root_path: Optional[Path] = None

    def parse_project(self, root_file: Path) -> Tuple[Dict[str, Section], set]:
        """Parse a documentation project starting from root file

        Returns:
            Tuple of (sections_dict, included_files_set)
            - sections_dict: Dictionary mapping section IDs to Section objects
            - included_files_set: Set of Path objects for files that were included (excluding root_file)
        """
        self.processed_files.clear()
        # Remember the project root so _parse_structure() can produce
        # portable, relative source paths (bug fix: root_path was unset).
        self.root_path = root_file.parent
        content, line_sources = self._resolve_includes_with_sources(root_file, 0)
        sections = self._parse_structure_with_sources(content, line_sources)

        # Get included files (all processed files except the root file itself)
        included_files = {Path(f) for f in self.processed_files if Path(f) != root_file}

        return sections, included_files

    def _resolve_includes(self, file_path: Path, depth: int) -> str:
        """Resolve include directives recursively.

        Replaces each ``include::file.adoc[]`` occurrence with the included
        file's (recursively resolved) content. Depth limits, circular
        references, and read errors are reported inline as ``//`` comment
        lines rather than raised, so a broken include never aborts parsing.
        """
        if depth >= self.max_include_depth or str(file_path) in self.processed_files:
            return f"// Include depth limit reached or circular reference: {file_path}\n"

        self.processed_files.add(str(file_path))

        try:
            content = file_path.read_text(encoding='utf-8')
        except Exception:
            # Best-effort: surface the failure in the output instead of raising.
            return f"// Error reading file: {file_path}\n"

        # Handle AsciiDoc includes: include::file.adoc[]
        include_pattern = r'include::([^[\]]+)\[\]'

        def replace_include(match):
            include_file = match.group(1)
            # Include targets are resolved relative to the including file.
            include_path = file_path.parent / include_file
            return self._resolve_includes(include_path, depth + 1)

        return re.sub(include_pattern, replace_include, content)

    def _resolve_includes_with_sources(self, file_path: Path, depth: int) -> Tuple[str, List[str]]:
        """Resolve include directives recursively while tracking source files for each line

        Returns:
            Tuple of (content, line_sources)
            - content: Resolved content string
            - line_sources: List where line_sources[i] is the source file path for line i
        """
        if depth >= self.max_include_depth or str(file_path) in self.processed_files:
            error_line = f"// Include depth limit reached or circular reference: {file_path}\n"
            return error_line, [str(file_path)]

        self.processed_files.add(str(file_path))

        try:
            content = file_path.read_text(encoding='utf-8')
        except Exception:
            error_line = f"// Error reading file: {file_path}\n"
            return error_line, [str(file_path)]

        # Handle AsciiDoc includes: include::file.adoc[]
        # NOTE: unlike _resolve_includes(), the directive must start the line
        # (re.match), which matches AsciiDoc's line-oriented include syntax.
        include_pattern = r'include::([^[\]]+)\[\]'
        lines = content.split('\n')
        result_lines = []
        result_sources = []

        for line in lines:
            match = re.match(include_pattern, line)
            if match:
                include_file = match.group(1)
                include_path = file_path.parent / include_file
                included_content, included_sources = self._resolve_includes_with_sources(include_path, depth + 1)

                # Add included lines and their sources
                included_lines = included_content.split('\n')
                result_lines.extend(included_lines)
                result_sources.extend(included_sources)
            else:
                result_lines.append(line)
                result_sources.append(str(file_path))

        return '\n'.join(result_lines), result_sources

    def _parse_structure(self, content: str, source_file: str) -> Dict[str, Section]:
        """Parse document structure into hierarchical sections.

        All sections are attributed to ``source_file``, converted to a path
        relative to ``self.root_path`` when possible for portability.
        """
        # Convert to relative path for portability. TypeError covers
        # root_path=None; ValueError covers paths outside root_path.
        try:
            rel_source_file = str(Path(source_file).relative_to(self.root_path))
        except (ValueError, TypeError, AttributeError):
            # If relative_to fails or root_path not set, use source_file as-is
            rel_source_file = source_file

        lines = content.split('\n')
        # Legacy single-source path: every line maps to the same file and
        # document_position keeps its default of 0.
        return self._parse_sections(lines, lambda i: rel_source_file, track_position=False)

    def _parse_structure_with_sources(self, content: str, line_sources: List[str]) -> Dict[str, Section]:
        """Parse document structure using source file mapping for each line."""
        lines = content.split('\n')

        def source_for(i: int) -> str:
            # Bounds-checked lookup: fall back to the last known source, or
            # "unknown" when no mapping was provided at all.
            if i < len(line_sources):
                return line_sources[i]
            return line_sources[-1] if line_sources else "unknown"

        return self._parse_sections(lines, source_for, track_position=True)

    def _parse_sections(self, lines: List[str], source_for, track_position: bool) -> Dict[str, Section]:
        """Shared section-parsing loop for both _parse_structure variants.

        Args:
            lines: Resolved document split into lines.
            source_for: Callable mapping a line index to its source file path.
            track_position: When True, record each header's line index as the
                section's document_position (used for document-order sorting).

        Returns:
            Dictionary mapping section IDs to Section objects.
        """
        sections: Dict[str, Section] = {}
        section_stack: List[Section] = []
        current_content: List[str] = []

        # Code block tracking (Issue #49)
        in_code_block = False
        code_block_delimiter = None

        for i, line in enumerate(lines):
            stripped_line = line.strip()

            # Track code block boundaries (Issue #49)
            # Detect block attributes: [source,python], [plantuml], etc.
            if stripped_line.startswith('[') and (
                'source' in stripped_line or
                'plantuml' in stripped_line or
                'listing' in stripped_line
            ):
                # Next delimiter will start a code block
                current_content.append(line)
                continue

            # Detect code block delimiters: ----, ...., etc.
            if stripped_line in ('----', '....', '====', '****'):
                if in_code_block and stripped_line == code_block_delimiter:
                    # Exit code block (only the matching delimiter closes it)
                    in_code_block = False
                    code_block_delimiter = None
                elif not in_code_block:
                    # Enter code block
                    in_code_block = True
                    code_block_delimiter = stripped_line
                current_content.append(line)
                continue

            # Skip header parsing if inside code block (Issue #49)
            if in_code_block:
                current_content.append(line)
                continue

            # AsciiDoc headers: = Title, == Title, etc.
            # Markdown headers: # Title, ## Title, etc.
            header_match = re.match(r'^(=+|#+)\s+(.+)$', stripped_line)
            if not header_match:
                current_content.append(line)
                continue

            # A header closes the previous section's content span.
            if section_stack:
                section_stack[-1].content = '\n'.join(current_content).strip()
                section_stack[-1].line_end = i - 1

            # Header depth is the marker length; title is everything after it.
            level = len(header_match.group(1))
            title = header_match.group(2).strip()
            section_id = self._generate_section_id(title, level, section_stack)

            section = Section(
                id=section_id,
                title=title,
                level=level,
                content="",
                line_start=i,
                line_end=i,
                source_file=source_for(i),
                children=[],
                document_position=i if track_position else 0,
            )

            # Manage hierarchy: pop siblings/deeper sections, then attach to
            # the nearest remaining ancestor (if any).
            while section_stack and section_stack[-1].level >= level:
                section_stack.pop()
            if section_stack:
                section.parent_id = section_stack[-1].id
                section_stack[-1].children.append(section_id)

            section_stack.append(section)
            sections[section_id] = section
            current_content = []

        # Handle last section: flush remaining content to the open section.
        if section_stack:
            section_stack[-1].content = '\n'.join(current_content).strip()
            section_stack[-1].line_end = len(lines) - 1

        return sections

    def _generate_section_id(self, title: str, level: int, section_stack: List[Section]) -> str:
        """Generate hierarchical section ID

        Builds a dotted path from the IDs of open ancestor sections (those
        shallower than ``level``) plus a slugified form of ``title``.
        """
        # Clean title for ID: drop punctuation, lowercase, hyphenate spaces.
        clean_title = re.sub(r'[^\w\s-]', '', title.lower()).replace(' ', '-')

        # Build hierarchical path from ancestors only (level strictly less).
        path_parts = []
        for section in section_stack:
            if section.level < level:
                path_parts.append(section.id.split('.')[-1])

        path_parts.append(clean_title)
        return '.'.join(path_parts)