Coverage for src/document_parser.py

1import re

2import os

3from pathlib import Path

4from typing import Dict, List, Optional, Tuple

5from dataclasses import dataclass

@dataclass
class Section:
    """A single heading of a parsed document together with its body text.

    Relationships between sections are expressed through IDs rather than
    object references, so a section never holds another ``Section`` directly.

    Attributes:
        id: Dotted, hierarchical identifier of the section.
        title: Heading text without the leading ``=`` / ``#`` markers.
        level: Number of marker characters in the heading (1 = top level).
        content: Body text collected between this heading and the next one.
        line_start: Index of the heading line in the resolved document.
        line_end: Index of the last line attributed to this section.
        source_file: Path of the file the heading came from. Intended to be
            a relative path; the parser stores whatever path string it
            resolved for that line.
        children: IDs of the direct sub-sections.
        parent_id: ID of the enclosing section, or ``None`` for a root.
        document_position: Position in the resolved document, used for
            sorting sections into document order.
    """

    id: str
    title: str
    level: int
    content: str
    line_start: int
    line_end: int
    source_file: str
    children: List[str]
    parent_id: Optional[str] = None
    document_position: int = 0

class DocumentParser:
    """Parse AsciiDoc/Markdown documents into a dictionary of sections.

    ``include::file[]`` directives are expanded recursively, after which the
    combined text is split at heading lines into ``Section`` objects keyed by
    a dotted, hierarchical ID.
    """

    # AsciiDoc include with empty attribute list: include::file.adoc[]
    _INCLUDE_RE = re.compile(r'include::([^[\]]+)\[\]')
    # AsciiDoc (= Title, == Title) and Markdown (# Title, ## Title) headings.
    _HEADER_RE = re.compile(r'^(=+|#+)\s+(.+)$')
    # Characters removed from a title when deriving its ID.
    _ID_STRIP_RE = re.compile(r'[^\w\s-]')
    # Lines that open/close a delimited block whose body is never a heading.
    _BLOCK_DELIMITERS = frozenset({'----', '....', '====', '****'})
    # Keywords identifying a block attribute line such as [source,python].
    _BLOCK_ATTRIBUTE_KEYWORDS = ('source', 'plantuml', 'listing')

    def __init__(self, max_include_depth: int = 4):
        """Create a parser.

        Args:
            max_include_depth: Include nesting limit. The root file is depth
                0; a file reached at a depth greater than or equal to this
                value is not read and is replaced by a placeholder line.
        """
        self.max_include_depth = max_include_depth
        # String paths of every file an expansion pass has tried to read.
        self.processed_files = set()

    def parse_project(self, root_file: Path) -> Tuple[Dict[str, "Section"], set]:
        """Parse a documentation project starting from ``root_file``.

        Args:
            root_file: Entry document whose includes are expanded.

        Returns:
            Tuple of ``(sections, included_files)``:

            - ``sections``: dictionary mapping section IDs to ``Section``
              objects.
            - ``included_files``: set of ``Path`` objects for every processed
              file other than ``root_file``. A path is recorded before the
              read is attempted, so unreadable includes are part of the set.
        """
        self.processed_files.clear()
        content, line_sources = self._resolve_includes_with_sources(root_file, 0)
        sections = self._parse_structure_with_sources(content, line_sources)

        included_files = {
            Path(name) for name in self.processed_files if Path(name) != root_file
        }
        return sections, included_files

    def _resolve_includes(self, file_path: Path, depth: int) -> str:
        """Return the text of ``file_path`` with include directives expanded.

        A file that is too deep, was already processed during this pass, or
        cannot be read is replaced by a one-line ``//`` placeholder. Because
        processed files are tracked for the whole pass, a second include of
        the same file also yields the "circular reference" placeholder.

        Args:
            file_path: File to read.
            depth: Nesting depth of ``file_path`` (0 for the root file).

        Returns:
            The expanded text, or a placeholder line ending in a newline.
        """
        source = str(file_path)
        if depth >= self.max_include_depth or source in self.processed_files:
            return f"// Include depth limit reached or circular reference: {file_path}\n"

        self.processed_files.add(source)

        try:
            content = file_path.read_text(encoding='utf-8')
        except (OSError, ValueError):
            # OSError: missing/unreadable file; ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path string.
            return f"// Error reading file: {file_path}\n"

        def expand(match):
            # Include targets are resolved relative to the including file.
            return self._resolve_includes(file_path.parent / match.group(1), depth + 1)

        return self._INCLUDE_RE.sub(expand, content)

    @staticmethod
    def _placeholder_with_sources(text: str, source: str) -> Tuple[str, List[str]]:
        """Pair a placeholder text with one source entry per resulting line.

        The placeholder ends with a newline, so splitting it on newlines
        yields the message plus a trailing empty line; both need a source
        entry to keep the line/source lists the same length.
        """
        return text, [source] * (text.count('\n') + 1)

    def _resolve_includes_with_sources(self, file_path: Path, depth: int) -> Tuple[str, List[str]]:
        """Expand includes while recording the source file of every line.

        Only a directive at the very start of a line is expanded here; the
        whole line is replaced by the included content. Placeholder rules are
        the same as in ``_resolve_includes``.

        Args:
            file_path: File to read.
            depth: Nesting depth of ``file_path`` (0 for the root file).

        Returns:
            Tuple of ``(content, line_sources)``:

            - ``content``: resolved content string.
            - ``line_sources``: list where ``line_sources[i]`` is the source
              file path of line ``i`` of ``content.split('\\n')``; the two
              always have the same length.
        """
        source = str(file_path)
        if depth >= self.max_include_depth or source in self.processed_files:
            return self._placeholder_with_sources(
                f"// Include depth limit reached or circular reference: {file_path}\n",
                source,
            )

        self.processed_files.add(source)

        try:
            content = file_path.read_text(encoding='utf-8')
        except (OSError, ValueError):
            # OSError: missing/unreadable file; ValueError: undecodable bytes
            # (UnicodeDecodeError) or an invalid path string.
            return self._placeholder_with_sources(
                f"// Error reading file: {file_path}\n",
                source,
            )

        result_lines = []
        result_sources = []

        for line in content.split('\n'):
            match = self._INCLUDE_RE.match(line)
            if match is None:
                result_lines.append(line)
                result_sources.append(source)
                continue

            # Include targets are resolved relative to the including file.
            include_path = file_path.parent / match.group(1)
            included_content, included_sources = self._resolve_includes_with_sources(
                include_path, depth + 1
            )
            result_lines.extend(included_content.split('\n'))
            result_sources.extend(included_sources)

        return '\n'.join(result_lines), result_sources

    def _parse_structure(self, content: str, source_file: str) -> Dict[str, "Section"]:
        """Parse document structure into hierarchical sections.

        Every section is attributed to ``source_file``. If the instance has a
        ``root_path`` attribute (this class never assigns one itself), the
        path is stored relative to it when possible.

        Args:
            content: Full document text.
            source_file: Path recorded on every created section.

        Returns:
            Dictionary mapping section IDs to ``Section`` objects. Their
            ``document_position`` is left at its default.
        """
        rel_source_file = source_file
        root_path = getattr(self, "root_path", None)
        if root_path is not None:
            try:
                rel_source_file = str(Path(source_file).relative_to(root_path))
            except (ValueError, AttributeError):
                # Not located under root_path: keep the path as given.
                rel_source_file = source_file

        return self._build_sections(
            content.split('\n'),
            lambda _index: rel_source_file,
            record_position=False,
        )

    def _parse_structure_with_sources(self, content: str, line_sources: List[str]) -> Dict[str, "Section"]:
        """Parse document structure using a per-line source file mapping.

        Args:
            content: Resolved document text.
            line_sources: Source file path for each line of ``content``.

        Returns:
            Dictionary mapping section IDs to ``Section`` objects. Each
            section takes the source of its heading line and stores that
            line's index as ``document_position``.
        """
        def source_for(index):
            # Bounds-checked lookup: fall back to the last known source, or
            # to "unknown" when no mapping was supplied at all.
            if index < len(line_sources):
                return line_sources[index]
            if line_sources:
                return line_sources[-1]
            return "unknown"

        return self._build_sections(content.split('\n'), source_for, record_position=True)

    def _build_sections(self, lines, source_for_line, record_position):
        """Split ``lines`` into sections at heading lines.

        Args:
            lines: Document lines without their newline characters.
            source_for_line: Callable mapping a line index to the source file
                string stored on a section whose heading is on that line.
            record_position: When true, the heading's line index is stored as
                ``document_position``; otherwise it stays 0.

        Returns:
            Dictionary mapping section IDs to ``Section`` objects.
        """
        sections = {}
        stack = []  # chain of currently open sections, outermost first
        pending = []  # body lines seen since the most recent heading
        open_delimiter = None  # delimiter of the block we are inside, if any

        for index, line in enumerate(lines):
            stripped = line.strip()

            # Block attribute lines ([source,python], [plantuml], ...) are
            # plain body text.
            if stripped.startswith('[') and any(
                keyword in stripped for keyword in self._BLOCK_ATTRIBUTE_KEYWORDS
            ):
                pending.append(line)
                continue

            # Delimiter lines toggle block state. Only the delimiter that
            # opened a block closes it; other delimiters inside are ignored.
            if stripped in self._BLOCK_DELIMITERS:
                if open_delimiter is None:
                    open_delimiter = stripped
                elif stripped == open_delimiter:
                    open_delimiter = None
                pending.append(line)
                continue

            # Inside a delimited block nothing is treated as a heading
            # (Issue #49).
            if open_delimiter is not None:
                pending.append(line)
                continue

            header = self._HEADER_RE.match(stripped)
            if header is None:
                pending.append(line)
                continue

            # Close the section that was collecting text. Text that precedes
            # the first heading has no owning section and is dropped.
            if stack:
                stack[-1].content = '\n'.join(pending).strip()
                stack[-1].line_end = index - 1

            level = len(header.group(1))
            title = header.group(2).strip()

            # The ID is computed before the stack is unwound; the generator
            # filters out entries that are not ancestors of this level.
            section_id = self._generate_section_id(title, level, stack)

            section = Section(
                id=section_id,
                title=title,
                level=level,
                content="",
                line_start=index,
                line_end=index,
                source_file=source_for_line(index),
                children=[],
                document_position=index if record_position else 0,
            )

            # Unwind to the nearest section with a smaller level: the parent.
            while stack and stack[-1].level >= level:
                stack.pop()

            if stack:
                section.parent_id = stack[-1].id
                stack[-1].children.append(section_id)

            stack.append(section)
            # NOTE(review): headings that produce the same ID overwrite each
            # other here; only the last one is kept in the result.
            sections[section_id] = section
            pending = []

        # The last open section receives whatever text is left.
        if stack:
            stack[-1].content = '\n'.join(pending).strip()
            stack[-1].line_end = len(lines) - 1

        return sections

    def _generate_section_id(self, title: str, level: int, section_stack: List["Section"]) -> str:
        """Generate a hierarchical section ID.

        Args:
            title: Heading text.
            level: Heading level of the new section.
            section_stack: Currently open sections, outermost first. Entries
                whose level is not smaller than ``level`` are ignored.

        Returns:
            The last ID component of every ancestor followed by the cleaned
            title, joined with dots. The title is lower-cased, stripped of
            characters other than word characters, whitespace and hyphens,
            and has its spaces replaced by hyphens.
        """
        clean_title = self._ID_STRIP_RE.sub('', title.lower()).replace(' ', '-')

        path_parts = [
            ancestor.id.split('.')[-1]
            for ancestor in section_stack
            if ancestor.level < level
        ]
        path_parts.append(clean_title)
        return '.'.join(path_parts)

Coverage for src/document_parser.py: 63%

175 statements