|
@ -283,94 +283,3 @@ def BlackPyramid_links_parser(soup): |
|
|
href.append(link) |
|
|
href.append(link) |
|
|
|
|
|
|
|
|
return href |
|
|
return href |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import glob |
|
|
|
|
|
import os |
|
|
|
|
|
import codecs |
|
|
|
|
|
import shutil |
|
|
|
|
|
import traceback |
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Ad-hoc test harness: feed saved BlackPyramid HTML pages (captured on
    # 10/22/2023) through the description/listing parsers and print the
    # results so they can be inspected manually.
    nError = 0
    marketPlace = 'BlackPyramid'

    lines = []  # listing pages
    lns = []    # description pages
    detPage = {}

    # The description section below was previously disabled by wrapping it in
    # a triple-quoted string; it is now a real code path behind this flag so
    # it can be switched on without editing code structure. Default False
    # preserves the original behavior (listing pages only).
    PARSE_DESCRIPTIONS = False

    def _read_html(path):
        """Open the HTML file at *path* and return its BeautifulSoup tree.

        Tries UTF-8 first, then falls back to the platform default encoding
        (matching the original two-step open logic). File handles are managed
        with ``with`` so they are closed even if parsing raises.

        Returns None when the file cannot be read/parsed either way.
        """
        for opener in (lambda p: codecs.open(p, encoding='utf8'), open):
            try:
                with opener(path.strip('\n')) as html:
                    return BeautifulSoup(html, "html.parser")
            except Exception:
                # Try the next opener; the caller reports the failure.
                continue
        return None

    if PARSE_DESCRIPTIONS:
        # reading description pages
        count = 0
        for fileDescription in glob.glob(os.path.join(
                "..\\" + marketPlace + "\\HTML_Pages\\10222023\\Description", '*.html')):
            count += 1
            lns.append(fileDescription)
            # if count > 5:
            #     break

        for index, line2 in enumerate(lns):
            print("Reading description folder of '" + marketPlace + "', file '"
                  + os.path.basename(line2) + "', index= " + str(index + 1)
                  + " ... " + str(len(lns)))

            soup = _read_html(line2)
            if soup is None:
                nError += 1
                print("There was a problem to read the file " + line2 + " in the Description section!")
                # if createLog:
                #     logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n")
                continue

            try:
                print(BlackPyramid_description_parser(soup))
            except Exception:
                traceback.print_exc()
                print("There was a problem to parse the file " + line2 + " in the Description section!")

    # reading listing pages
    count = 0
    for fileListing in glob.glob(os.path.join(
            "..\\" + marketPlace + "\\HTML_Pages\\10222023\\Listing", '*.html')):
        count += 1
        lines.append(fileListing)
        # if count > 1:
        #     break

    for index, line1 in enumerate(lines):
        print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(
            index + 1) + " ... " + str(len(lines)))

        soup = _read_html(line1)
        if soup is None:
            # Mirrors the original readError path: report and skip parsing.
            print("There was a problem to read the file " + line1 + " in the Listing section!")
            continue

        try:
            print(BlackPyramid_listing_parser(soup))
        except Exception:
            traceback.print_exc()
            print("There was a problem to parse the file " + line1 + " in the listing section!")

    print("DONE")
|
|
|