__author__ = 'DarkWeb'

import codecs
import glob
import os
import shutil
# explicit imports for names used below (date, time, traceback, BeautifulSoup);
# they may also be pulled in by the wildcard parser imports, but listing them
# here keeps this module self-contained
import time
import traceback
from datetime import date
from bs4 import BeautifulSoup

from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
from Forums.DWForums.parser import *
from Forums.Dread.parser import *
from Forums.Helium.parser import *
# from Forums.Nulled.parser import *
from Forums.Classifier.classify_product import predict
#from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
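
# This module takes the HTML pages collected by the crawlers for a given forum
# (Listing and Description pages stored under <forum>\HTML_Pages\<MMDDYYYY>\),
# parses them with the forum-specific parsers imported above, classifies each
# topic, persists the merged records to the database, and moves successfully
# processed files into a 'Read' folder.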


# determines if forum is russian, not really used now but maybe later
def isRussianForum(forum):

    with open('russian_forums.txt') as f:
        forums = f.readlines()

    result = False
    for iforum in forums:
        iforum = iforum.replace('\n', '')
        if iforum == forum:
            result = True
            break

    return result
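
# Note: isRussianForum assumes russian_forums.txt holds one forum name per line,
# matching the forum argument exactly, e.g.:
#   SomeRussianForum
#   AnotherRussianForum
# (the names above are placeholders, not entries from the real file)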


#tries to match description pages to listing pages by using a key made for every description page and every link in listing page
#once verified and matched, the info is merged into a 'rec', which is returned
#@param: rmm is the row of data parsed from the matched description page, rec is the row of data of an instance from the listing page
#return: rec, row of data, that may have additional data added on after matching description to listing page
def mergePages(rmm, rec):

    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
    # key = rec[16]

    print("----------------- Matched: " + rec[1] + "--------------------")

    rec[8] = rmm[1]
    rec[9] = rmm[2]
    rec[10] = rmm[3]
    rec[11] = rmm[4]
    rec[12] = rmm[5]
    rec[13] = rmm[6]
    rec[14] = rmm[7]
    rec[15] = rmm[8]

    return rec
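
# Note: the slots filled above (rec[8]..rec[15]) are taken from rmm[1]..rmm[8];
# the exact meaning of each column is defined by the per-forum parsers, so this
# mapping assumes the listing and description parsers share the same layout.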


#gets a list of posts and joins them together into one string to be put in the database as one string of text
#@param: posts, list of strings (the posts of a thread)
#return: string containing the concatenation of all the strings
def getPosts(posts):

    strPosts = ' '.join(posts)
    return strPosts.strip()
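
# Example: getPosts(["first post", "second post "]) returns "first post second post"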


#uses the db_connection module's create_* methods to persist the values of a row into the correct tables
#@param: row is the list of entries for this instance, cur is the db cursor object
def persist_data(row, cur):

    user = create_user(cur, row[5])
    forum = create_forum(cur, row)
    board = create_board(cur, row, forum)
    topic = create_topic(cur, row, forum, board, user)
    create_posts(cur, row, forum, board, topic)
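
# The create_* calls run in the order user -> forum -> board -> topic -> posts,
# presumably so each insert can reference the records created before it
# (create_topic, for example, receives the forum, board and user).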


#main method for this program: gets the parsed info from the parsers and persists it into the db
#calls the different parser methods here depending on the type of html page
def new_parse(forum, createLog):

    print("Parsing the " + forum + " forum and conducting data classification to store the information in the database.")

    crawlerDate = date.today()

    ini = time.time()

    global site

    # Connecting to the database
    con = connectDataBase()
    cur = con.cursor()

    # Creating the tables (The database should be created manually)
    create_database(cur, con)

    nError = 0

    lines = []    # listing page files
    lns = []      # description page files
    detPage = {}  # description records keyed by url
    rw = []       # rows parsed from the current listing page
    # Creating the log file for each Forum
    if createLog:
        if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + str("%02d" % crawlerDate.month) +
                              str("%02d" % crawlerDate.day) + str("%04d" % crawlerDate.year) + ".log"):
            logFile = open("./" + forum + "/Logs/" + forum + "_" + str("%02d" % crawlerDate.month) +
                           str("%02d" % crawlerDate.day) + str("%04d" % crawlerDate.year) + ".log", "w")
        else:
            print("Files of the date " + str("%02d" % crawlerDate.month) + str("%02d" % crawlerDate.day) +
                  str("%04d" % crawlerDate.year) + " from the Forum " + forum +
                  " were already read. Delete the corresponding information in the database and also delete the log file "
                  "in the Logs folder to read files from this Forum of this date again.")
            raise SystemExit
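
    # Expected on-disk layout (from the paths below): the crawler stores pages in
    # <project root>\<forum>\HTML_Pages\<MMDDYYYY>\Listing\*.html and
    # ...\Description\*.html, and this script runs from the Initialization folder
    # (os.getcwd() has "Initialization" stripped to get back to the project root).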

    # Reading the Listing Html Pages
    for fileListing in glob.glob(os.path.join(os.getcwd().replace("Initialization", "") + forum + "\\HTML_Pages\\" +
                                              str("%02d" % crawlerDate.month) + str("%02d" % crawlerDate.day) +
                                              str("%04d" % crawlerDate.year) + "\\Listing", '*.html')):
        lines.append(fileListing)

    # Reading the Description Html Pages
    for fileDescription in glob.glob(os.path.join(os.getcwd().replace("Initialization", "") + forum + "\\HTML_Pages\\" +
                                                  str("%02d" % crawlerDate.month) + str("%02d" % crawlerDate.day) +
                                                  str("%04d" % crawlerDate.year) + "\\Description", '*.html')):
        lns.append(fileDescription)

    # Parsing the Description Pages and putting the tag's content into a dictionary (Hash table)
    for index, line2 in enumerate(lns):

        print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) +
              "', index= " + str(index + 1) + " ... " + str(len(lns)))

        try:
            html = codecs.open(line2.strip('\n'), encoding='utf8')
            soup = BeautifulSoup(html, "html.parser")
            html.close()
        except:

            try:
                html = open(line2.strip('\n'))
                soup = BeautifulSoup(html, "html.parser")
                html.close()
            except:

                nError += 1
                print("There was a problem reading the file " + line2 + " in the Description section!")
                if createLog:
                    logFile.write(str(nError) + ". There was a problem reading the file " + line2 + " in the Description section.\n")
                continue

        try:

            if forum == "BestCardingWorld":
                rmm = bestcardingworld_description_parser(soup)
            elif forum == "CryptBB":
                rmm = cryptBB_description_parser(soup)
            elif forum == "DWForums":
                rmm = dwForums_description_parser(soup)
            elif forum == "Dread":
                rmm = dread_description_parser(soup)
            elif forum == "Helium":
                rmm = helium_description_parser(soup)
            # elif forum == "Nulled":
            #     rmm = nulled_description_parser(soup)

            # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
            key = u"Url:" + os.path.basename(line2).replace(".html", "")

            # save the file name together with the description record in memory
            detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}

        except:

            nError += 1
            print("There was a problem parsing the file " + line2 + " in the Description section!")
            if createLog:
                logFile.write(str(nError) + ". There was a problem parsing the file " + line2 + " in the Description section.\n")
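
    # Matching between listing rows and description pages is done through the key
    # u"Url:<name>": for a description page the key comes from its file name, and
    # for a listing row it comes from the topic url in rec[16] stripped down to its
    # alphanumeric characters. This presumably works because the crawler names each
    # description file after the sanitized url of the topic it fetched.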

    # Parsing the Listing Pages and putting the tag's content into a list
    for index, line1 in enumerate(lines):

        print("Reading listing folder of '" + forum + "', file '" + os.path.basename(line1) +
              "', index= " + str(index + 1) + " ... " + str(len(lines)))

        readError = False
        try:
            html = codecs.open(line1.strip('\n'), encoding='utf8')
            soup = BeautifulSoup(html, "html.parser")
            html.close()
        except:

            try:
                html = open(line1.strip('\n'))
                soup = BeautifulSoup(html, "html.parser")
                html.close()
            except:
                nError += 1
                print("There was a problem reading the file " + line1 + " in the Listing section!")
                if createLog:
                    logFile.write(str(nError) + ". There was a problem reading the file " + line1 + " in the Listing section.\n")
                readError = True

        if not readError:

            parseError = False
            try:

                if forum == "BestCardingWorld":
                    rw = bestcardingworld_listing_parser(soup)
                elif forum == "CryptBB":
                    rw = cryptBB_listing_parser(soup)
                elif forum == "DWForums":
                    rw = dwForums_listing_parser(soup)
                elif forum == "Dread":
                    rw = dread_listing_parser(soup)
                elif forum == "Helium":
                    rw = helium_listing_parser(soup)
                # elif forum == "Nulled":
                #     rw = nulled_listing_parser(soup)

            except:

                nError += 1
                print("There was a problem parsing the file " + line1 + " in the Listing section!")
                traceback.print_exc()
                if createLog:
                    logFile.write(
                        str(nError) + ". There was a problem parsing the file " + line1 + " in the Listing section.\n")
                parseError = True

            if not parseError:

                persistError = False
                moveError = False
                num_in_db = 0
                num_persisted_moved = 0

                for rec in rw:

                    rec = rec.split(',')

                    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
                    # key = rec[16]

                    url = ''.join(e for e in rec[16] if e.isalnum())
                    key = u"Url:" + url

                    if key in detPage:

                        # Combining the information from the Listing and Description Pages
                        rmm = detPage[key]['rmm']
                        rec = mergePages(rmm, rec)

                        # Appending the classification of the topic to the record
                        # if isRussianForum(forum):
                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian')))
                        # else:
                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
                        rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))

                        # Persisting the information in the database
                        try:
                            persist_data(tuple(rec), cur)
                            con.commit()
                        except:

                            trace = traceback.format_exc()

                            if trace.find("already exists") == -1:
                                nError += 1
                                print("There was a problem persisting the file " + detPage[key]['filename'] + " in the database!")
                                if createLog:
                                    logFile.write(
                                        str(nError) + ". There was a problem persisting the file " + detPage[key]['filename'] + " in the database.\n")
                                persistError = True

                            con.rollback()

                        if not persistError:

                            # move the description file to the 'Read' (completed) folder
                            # (line2 still points into the Description folder from the loop above)
                            source = line2.replace(os.path.basename(line2), "") + detPage[key]['filename']
                            destination = line2.replace(os.path.basename(line2), "") + r'Read/'

                            try:
                                shutil.move(source, destination)
                                num_persisted_moved += 1
                            except:

                                print("There was a problem moving the file " + detPage[key]['filename'] + " in the Description section!")
                                nError += 1
                                if createLog:
                                    logFile.write(
                                        str(nError) + ". There was a problem moving the file " + detPage[key]['filename'] + " in the Description section.\n")
                                moveError = True

                    # if the associated description page was not read or not parsed
                    else:
                        # query the database
                        # if the post already exists:
                        #     num_in_db += 1
                        pass

                # if the number of topics on the listing page is equal to
                # the number of merged, persisted, and moved topics plus
                # the number of topics already in the database
                if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db):

                    # move the listing file to the 'Read' (completed) folder
                    source = line1
                    destination = line1.replace(os.path.basename(line1), "") + r'Read/'

                    try:
                        shutil.move(source, destination)
                    except:

                        nError += 1
                        print("There was a problem moving the file " + line1 + " in the Listing section!")
                        if createLog:
                            logFile.write(str(nError) + ". There was a problem moving the file " + line1 + " in the Listing section.\n")

    if createLog:
        logFile.close()

    #end = time.time()
    #finalTime = float(end - ini)
    #print(forum + " Parsing Performed Successfully in %.2f" % finalTime + "!")

    input("Parsing the " + forum + " forum and data classification done successfully. Press ENTER to continue\n")
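

# Illustrative invocation (a sketch only; the real entry point that calls
# new_parse lives elsewhere in the project, and the forum name below is just
# an example of one of the supported values):
#
# if __name__ == '__main__':
#     new_parse(forum="BestCardingWorld", createLog=True)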