Coverage for src/document_parser.py: 63%

175 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-08 05:40 +0000

1import re 

2import os 

3from pathlib import Path 

4from typing import Dict, List, Optional, Tuple 

5from dataclasses import dataclass 

6 

7@dataclass 

8class Section: 

9 id: str 

10 title: str 

11 level: int 

12 content: str 

13 line_start: int 

14 line_end: int 

15 source_file: str # Relative path to source file 

16 children: List[str] # Store child IDs instead of Section objects 

17 parent_id: Optional[str] = None # Store parent ID instead of Section object 

18 document_position: int = 0 # Position in resolved document for proper sorting 

19 

20class DocumentParser: 

21 def __init__(self, max_include_depth: int = 4): 

22 self.max_include_depth = max_include_depth 

23 self.processed_files = set() 

24 

25 def parse_project(self, root_file: Path) -> Tuple[Dict[str, Section], set]: 

26 """Parse a documentation project starting from root file 

27 

28 Returns: 

29 Tuple of (sections_dict, included_files_set) 

30 - sections_dict: Dictionary mapping section IDs to Section objects 

31 - included_files_set: Set of Path objects for files that were included (excluding root_file) 

32 """ 

33 self.processed_files.clear() 

34 content, line_sources = self._resolve_includes_with_sources(root_file, 0) 

35 sections = self._parse_structure_with_sources(content, line_sources) 

36 

37 # Get included files (all processed files except the root file itself) 

38 included_files = {Path(f) for f in self.processed_files if Path(f) != root_file} 

39 

40 return sections, included_files 

41 

42 def _resolve_includes(self, file_path: Path, depth: int) -> str: 

43 """Resolve include directives recursively""" 

44 if depth >= self.max_include_depth or str(file_path) in self.processed_files: 

45 return f"// Include depth limit reached or circular reference: {file_path}\n" 

46 

47 self.processed_files.add(str(file_path)) 

48 

49 try: 

50 content = file_path.read_text(encoding='utf-8') 

51 except Exception: 

52 return f"// Error reading file: {file_path}\n" 

53 

54 # Handle AsciiDoc includes: include::file.adoc[] 

55 include_pattern = r'include::([^[\]]+)\[\]' 

56 

57 def replace_include(match): 

58 include_file = match.group(1) 

59 include_path = file_path.parent / include_file 

60 return self._resolve_includes(include_path, depth + 1) 

61 

62 return re.sub(include_pattern, replace_include, content) 

63 

64 def _resolve_includes_with_sources(self, file_path: Path, depth: int) -> Tuple[str, List[str]]: 

65 """Resolve include directives recursively while tracking source files for each line 

66  

67 Returns: 

68 Tuple of (content, line_sources) 

69 - content: Resolved content string 

70 - line_sources: List where line_sources[i] is the source file path for line i 

71 """ 

72 if depth >= self.max_include_depth or str(file_path) in self.processed_files: 

73 error_line = f"// Include depth limit reached or circular reference: {file_path}\n" 

74 return error_line, [str(file_path)] 

75 

76 self.processed_files.add(str(file_path)) 

77 

78 try: 

79 content = file_path.read_text(encoding='utf-8') 

80 except Exception: 

81 error_line = f"// Error reading file: {file_path}\n" 

82 return error_line, [str(file_path)] 

83 

84 # Handle AsciiDoc includes: include::file.adoc[] 

85 include_pattern = r'include::([^[\]]+)\[\]' 

86 lines = content.split('\n') 

87 result_lines = [] 

88 result_sources = [] 

89 

90 for line in lines: 

91 match = re.match(include_pattern, line) 

92 if match: 

93 include_file = match.group(1) 

94 include_path = file_path.parent / include_file 

95 included_content, included_sources = self._resolve_includes_with_sources(include_path, depth + 1) 

96 

97 # Add included lines and their sources 

98 included_lines = included_content.split('\n') 

99 result_lines.extend(included_lines) 

100 result_sources.extend(included_sources) 

101 else: 

102 result_lines.append(line) 

103 result_sources.append(str(file_path)) 

104 

105 return '\n'.join(result_lines), result_sources 

106 

107 def _parse_structure(self, content: str, source_file: str) -> Dict[str, Section]: 

108 """Parse document structure into hierarchical sections""" 

109 lines = content.split('\n') 

110 sections = {} 

111 section_stack = [] 

112 current_content = [] 

113 line_num = 0 

114 

115 # Code block tracking (Issue #49) 

116 in_code_block = False 

117 code_block_delimiter = None 

118 

119 # Convert to relative path for portability 

120 try: 

121 from pathlib import Path 

122 rel_source_file = str(Path(source_file).relative_to(self.root_path)) 

123 except (ValueError, AttributeError): 

124 # If relative_to fails or root_path not set, use source_file as-is 

125 rel_source_file = source_file 

126 

127 for i, line in enumerate(lines): 

128 stripped_line = line.strip() 

129 

130 # Track code block boundaries (Issue #49) 

131 # Detect block attributes: [source,python], [plantuml], etc. 

132 if stripped_line.startswith('[') and ( 

133 'source' in stripped_line or 

134 'plantuml' in stripped_line or 

135 'listing' in stripped_line 

136 ): 

137 # Next delimiter will start a code block 

138 current_content.append(line) 

139 continue 

140 

141 # Detect code block delimiters: ----, ...., etc. 

142 if stripped_line in ['----', '....', '====', '****']: 

143 if in_code_block and stripped_line == code_block_delimiter: 

144 # Exit code block 

145 in_code_block = False 

146 code_block_delimiter = None 

147 elif not in_code_block: 

148 # Enter code block 

149 in_code_block = True 

150 code_block_delimiter = stripped_line 

151 current_content.append(line) 

152 continue 

153 

154 # Skip header parsing if inside code block (Issue #49) 

155 if in_code_block: 

156 current_content.append(line) 

157 continue 

158 

159 # AsciiDoc headers: = Title, == Title, etc. 

160 # Markdown headers: # Title, ## Title, etc. 

161 header_match = re.match(r'^(=+|#+)\s+(.+)$', stripped_line) 

162 

163 if header_match: 

164 # Save previous section content 

165 if section_stack: 

166 section_stack[-1].content = '\n'.join(current_content).strip() 

167 section_stack[-1].line_end = i - 1 

168 

169 # Determine level 

170 level = len(header_match.group(1)) 

171 title = header_match.group(2).strip() 

172 

173 # Generate section ID 

174 section_id = self._generate_section_id(title, level, section_stack) 

175 

176 # Create new section 

177 section = Section( 

178 id=section_id, 

179 title=title, 

180 level=level, 

181 content="", 

182 line_start=i, 

183 line_end=i, 

184 source_file=rel_source_file, 

185 children=[] 

186 ) 

187 

188 # Manage hierarchy 

189 while section_stack and section_stack[-1].level >= level: 

190 section_stack.pop() 

191 

192 if section_stack: 

193 section.parent_id = section_stack[-1].id 

194 section_stack[-1].children.append(section_id) 

195 

196 section_stack.append(section) 

197 sections[section_id] = section 

198 current_content = [] 

199 else: 

200 current_content.append(line) 

201 

202 # Handle last section 

203 if section_stack: 

204 section_stack[-1].content = '\n'.join(current_content).strip() 

205 section_stack[-1].line_end = len(lines) - 1 

206 

207 return sections 

208 

209 def _parse_structure_with_sources(self, content: str, line_sources: List[str]) -> Dict[str, Section]: 

210 """Parse document structure using source file mapping for each line""" 

211 lines = content.split('\n') 

212 sections = {} 

213 section_stack = [] 

214 current_content = [] 

215 

216 # Code block tracking (Issue #49) 

217 in_code_block = False 

218 code_block_delimiter = None 

219 

220 for i, line in enumerate(lines): 

221 stripped_line = line.strip() 

222 

223 # Get source file for this line (with bounds checking) 

224 line_source_file = line_sources[i] if i < len(line_sources) else line_sources[-1] if line_sources else "unknown" 

225 

226 # Track code block boundaries (Issue #49) 

227 # Detect block attributes: [source,python], [plantuml], etc. 

228 if stripped_line.startswith('[') and ( 

229 'source' in stripped_line or 

230 'plantuml' in stripped_line or 

231 'listing' in stripped_line 

232 ): 

233 # Next delimiter will start a code block 

234 current_content.append(line) 

235 continue 

236 

237 # Detect code block delimiters: ----, ...., etc. 

238 if stripped_line in ['----', '....', '====', '****']: 

239 if in_code_block and stripped_line == code_block_delimiter: 

240 # Exit code block 

241 in_code_block = False 

242 code_block_delimiter = None 

243 elif not in_code_block: 

244 # Enter code block 

245 in_code_block = True 

246 code_block_delimiter = stripped_line 

247 current_content.append(line) 

248 continue 

249 

250 # Skip header parsing if inside code block (Issue #49) 

251 if in_code_block: 

252 current_content.append(line) 

253 continue 

254 

255 # AsciiDoc headers: = Title, == Title, etc. 

256 # Markdown headers: # Title, ## Title, etc. 

257 header_match = re.match(r'^(=+|#+)\s+(.+)$', stripped_line) 

258 

259 if header_match: 

260 # Save previous section content 

261 if section_stack: 

262 section_stack[-1].content = '\n'.join(current_content).strip() 

263 section_stack[-1].line_end = i - 1 

264 

265 # Determine level 

266 level = len(header_match.group(1)) 

267 title = header_match.group(2).strip() 

268 

269 # Generate section ID 

270 section_id = self._generate_section_id(title, level, section_stack) 

271 

272 # Use the source file for this specific line (the header line) 

273 section_source_file = line_source_file 

274 

275 # Create new section with proper source file 

276 section = Section( 

277 id=section_id, 

278 title=title, 

279 level=level, 

280 content="", 

281 line_start=i, 

282 line_end=i, 

283 source_file=section_source_file, 

284 children=[], 

285 document_position=i 

286 ) 

287 

288 # Manage hierarchy 

289 while section_stack and section_stack[-1].level >= level: 

290 section_stack.pop() 

291 

292 if section_stack: 

293 section.parent_id = section_stack[-1].id 

294 section_stack[-1].children.append(section_id) 

295 

296 section_stack.append(section) 

297 sections[section_id] = section 

298 current_content = [] 

299 else: 

300 current_content.append(line) 

301 

302 # Handle last section 

303 if section_stack: 

304 section_stack[-1].content = '\n'.join(current_content).strip() 

305 section_stack[-1].line_end = len(lines) - 1 

306 

307 return sections 

308 

309 def _generate_section_id(self, title: str, level: int, section_stack: List[Section]) -> str: 

310 """Generate hierarchical section ID""" 

311 # Clean title for ID 

312 clean_title = re.sub(r'[^\w\s-]', '', title.lower()).replace(' ', '-') 

313 

314 # Build hierarchical path 

315 path_parts = [] 

316 for section in section_stack: 

317 if section.level < level: 

318 path_parts.append(section.id.split('.')[-1]) 

319 

320 path_parts.append(clean_title) 

321 return '.'.join(path_parts)