@@ -97,9 +97,20 @@ def _process_problem_data(self, question):
         }

         # Process content with BeautifulSoup to extract description, examples, and constraints
+        import os
         content_html = question.get('content', '')
+        debug_dir = os.path.dirname(os.path.abspath(__file__))
+        debug_content_path = os.path.join(debug_dir, 'debug_content_html.txt')
+        debug_soup_path = os.path.join(debug_dir, 'debug_soup.html')
+        # Write content_html to a debug file for inspection
+        with open(debug_content_path, 'w', encoding='utf-8') as f:
+            f.write(content_html)
+        print(f"[DEBUG] Wrote content_html to {debug_content_path}")
         soup = BeautifulSoup(content_html, 'html.parser')
-
+        # Write the prettified soup HTML to a debug file for inspection
+        with open(debug_soup_path, 'w', encoding='utf-8') as f:
+            f.write(soup.prettify())
+        print(f"[DEBUG] Wrote soup HTML to {debug_soup_path}")
         # Get description (text before the first <strong>Example</strong>)
         description = []
         current_element = soup.find()
@@ -113,14 +124,23 @@ def _process_problem_data(self, question):

         problem_data['description'] = '\n'.join([d for d in description if d])

-        # Extract examples
+        # Extract examples and attach the closest preceding image to each
         examples = []
         example_blocks = soup.find_all('pre')
         for i, example in enumerate(example_blocks, 1):
-            examples.append({
+            example_dict = {
                 'example_num': i,
-                'example_text': example.get_text().strip()
-            })
+                'example_text': example.get_text().strip(),
+                'images': []
+            }
+            # Find the closest preceding <img> tag before this <pre>
+            prev = example.previous_element
+            while prev:
+                if getattr(prev, 'name', None) == 'img' and prev.has_attr('src'):
+                    example_dict['images'].append(prev['src'])
+                    break
+                prev = prev.previous_element
+            examples.append(example_dict)
         problem_data['examples'] = examples

         # Extract constraints
@@ -212,18 +232,22 @@ def scrape_problem_list(self, limit=10):

         return problem_list

-if __name__ == "__main__":
-    scraper = LeetCodeScraper()
+# if __name__ == "__main__":
+#     scraper = LeetCodeScraper()

     # Option 1: Scrape a specific problem
     # problem_data = scraper.scrape_problem("two-sum")
     # print(json.dumps(problem_data, indent=2))

+if __name__ == "__main__":
+    scraper = LeetCodeScraper()
+    problem_data = scraper.scrape_problem("linked-list-cycle")
+    print(json.dumps(problem_data, indent=2))
     # Option 2: Scrape multiple problems from the list
-    problem_list = scraper.scrape_problem_list(limit=5)
+    # problem_list = scraper.scrape_problem_list(limit=5)

-    # Add a delay between requests to avoid being blocked
-    for problem in problem_list:
-        print(f"Scraping problem: {problem['title']} ({problem['slug']})")
-        scraper.scrape_problem(problem['slug'])
-        time.sleep(2)  # Wait 2 seconds between requests
+    # # Add a delay between requests to avoid being blocked
+    # for problem in problem_list:
+    #     print(f"Scraping problem: {problem['title']} ({problem['slug']})")
+    #     scraper.scrape_problem(problem['slug'])
+    #     time.sleep(2)  # Wait 2 seconds between requests
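
A minimal standalone sketch (not part of the commit) of the traversal that the new per-example loop in the second hunk relies on: BeautifulSoup's previous_element walks backwards through document parse order, so the first img tag encountered is the closest one preceding the pre block. The HTML snippet and image URL below are illustrative only, not LeetCode's actual markup.

from bs4 import BeautifulSoup

# Illustrative markup; the real content comes from the scraped question payload.
html = """
<p>Some description.</p>
<img src="https://example.com/list-diagram.png">
<p><strong>Example 1:</strong></p>
<pre>Input: head = [3,2,0,-4], pos = 1
Output: true</pre>
"""

soup = BeautifulSoup(html, 'html.parser')
pre = soup.find('pre')
prev = pre.previous_element
while prev:
    # getattr handles non-Tag nodes (e.g. NavigableString) safely, and the
    # short-circuiting 'and' ensures has_attr is only called on a Tag
    if getattr(prev, 'name', None) == 'img' and prev.has_attr('src'):
        print(prev['src'])  # -> https://example.com/list-diagram.png
        break
    prev = prev.previous_element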