This is based on the calsyslab project.
__author__ = 'DarkWeb'

import codecs
import glob
import os, re
import shutil
import traceback

from bs4 import BeautifulSoup

from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi


# determines if forum is russian, not really used now but maybe later
def isRussianForum(forum):

    with open('russian_forums.txt') as f:
        forums = f.readlines()

    result = False
    for iforum in forums:

        iforum = iforum.replace('\n', '')
        if iforum == forum:
            result = True
            break

    return result
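
# russian_forums.txt is assumed to contain one forum name per line; a name must match
# the 'forum' argument exactly (after the trailing newline is stripped) to be treated as Russian.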


# tries to match description pages to listing pages by using a key made for every description page and every link in a listing page
# once verified and matched, the info is merged into a 'rec', which is returned
# @param: rmm is the row of data parsed from a description page, rec is the row of data of an instance from a listing page
# @return: rec, row of data, that may have additional data added on after matching description to listing page
def mergePages(rmm, rec):

    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
    # key = rec[16]

    print("----------------- Matched: " + rec[3] + "--------------------")

    # copy the description fields (rmm[1..8]) into the listing row (rec[9..16])
    rec[9] = rmm[1]
    rec[10] = rmm[2]
    rec[11] = rmm[3]
    rec[12] = rmm[4]
    rec[13] = rmm[5]
    rec[14] = rmm[6]
    rec[15] = rmm[7]
    rec[16] = rmm[8]

    return rec


# gets a list of posts and joins them together into one string to be put in the database as one string of text
# @param: list of strings (the posts of a thread)
# @return: string containing the concatenation of all the strings
def getPosts(posts):

    strPosts = ' '.join(posts)
    return strPosts.strip()
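# e.g. getPosts(['first post', 'second post']) returns 'first post second post'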


# uses the db connection methods to persist the values into the correct tables
# @param: url is the forum's url, row is the list of entries for this instance, cur is the db cursor object
def persist_data(url, row, cur):

    forum = create_forum(cur, row, url)

    board = create_board(cur, row, forum)

    author = create_user(cur, row, forum, 0)

    topic = create_topic(cur, row, forum, board, author)

    create_posts(cur, row, forum, board, topic)
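
# the create_* helpers presumably come from the db_connection star import above; they are
# called in dependency order (forum -> board -> user -> topic -> posts), and the caller
# (new_parse) is responsible for committing or rolling back the transaction.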


# main method for this program: gets the parsed info from the parsers and persists it into the db
# calls the different parser methods here depending on the type of html page
def new_parse(forum, url, createLog):

    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    print("Parsing the " + forum + " forum and conducting data classification to store the information in the database.")

    # ini = time.time()

    # Connecting to the database
    con = connectDataBase()
    cur = con.cursor()

    # Creating the tables (the database itself should be created manually)
    create_database(cur, con)

    nError = 0

    lines = []    # listing pages
    lns = []      # description pages
    detPage = {}  # first pages
    other = {}    # other pages

    # Creating the log file for each Forum
    if createLog:
        if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log"):
            logFile = open("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log", "w")
        else:
            print("Files of the date " + CURRENT_DATE + " from the Forum " + forum +
                  " were already read. Delete the referent information in the Data Base and also delete the log file"
                  " in the _Logs folder to read files from this Forum of this date again.")
            raise SystemExit

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")
    # Reading the Listing Html Pages
    # (the "\\" separator assumes the shared folder uses Windows-style paths)
    for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')):
        lines.append(fileListing)

    # Reading the Description Html Pages
    for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')):
        lns.append(fileDescription)
    # Parsing the Description Pages and putting the tags' content into a dictionary (hash table)
    for index, line2 in enumerate(lns):

        print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))

        try:
            html = codecs.open(line2.strip('\n'), encoding='utf8')
            soup = BeautifulSoup(html, "html.parser")
            html.close()
        except:

            try:
                html = open(line2.strip('\n'))
                soup = BeautifulSoup(html, "html.parser")
                html.close()
            except:

                nError += 1
                print("There was a problem to read the file " + line2 + " in the Description section!")
                if createLog:
                    logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section!\n")
                continue
        try:

            if forum == "BestCardingWorld":
                rmm = bestcardingworld_description_parser(soup)
            elif forum == "CryptBB":
                rmm = cryptBB_description_parser(soup)
            elif forum == "OnniForums":
                rmm = onniForums_description_parser(soup)
            elif forum == "Altenens":
                rmm = altenens_description_parser(soup)
            elif forum == "Procrax":
                rmm = procrax_description_parser(soup)

            # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
            key = u"Url:" + os.path.basename(line2).replace(".html", "")

            # check if "page1" exists at the end of a string
            # if yes add to first page directory if no add to other
            check = re.compile(r'page1$')
            if check.search(key):
                # print(key, 'is a first page\n')
                detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
            else:
                # print(key, 'is an other page\n')
                other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}

        except:

            nError += 1
            print("There was a problem to parse the file " + line2 + " in the Description section!")
            traceback.print_exc()
            if createLog:
                logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
    # goes through keys from detPage and other and checks if the keys match;
    # if yes, adds other[key] values to detPage without overwriting
    for key in detPage.keys():
        for k in list(other.keys()):
            checkkey = str(key[4:])
            checkk = str(k[4:])
            if checkkey in checkk:
                detPage[key]['rmm'][1].extend(other[k]['rmm'][1])
                detPage[key]['rmm'][2].extend(other[k]['rmm'][2])
                detPage[key]['rmm'][3].extend(other[k]['rmm'][3])
                detPage[key]['rmm'][4].extend(other[k]['rmm'][4])
                detPage[key]['rmm'][5].extend(other[k]['rmm'][5])
                detPage[key]['rmm'][6].extend(other[k]['rmm'][6])
                detPage[key]['rmm'][7].extend(other[k]['rmm'][7])
                detPage[key]['rmm'][8].extend(other[k]['rmm'][8])
                detPage[key]['files'].append(other[k]['filename'])
                other.pop(k)
    # Parsing the Listing Pages and put the tag's content into a list
    for index, line1 in enumerate(lines):

        print("Reading listing folder of '" + forum + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines)))

        readError = False
        try:
            html = codecs.open(line1.strip('\n'), encoding='utf8')
            soup = BeautifulSoup(html, "html.parser")
            html.close()
        except:

            try:
                html = open(line1.strip('\n'))
                soup = BeautifulSoup(html, "html.parser")
                html.close()
            except:

                nError += 1
                print("There was a problem to read the file " + line1 + " in the Listing section!")
                if createLog:
                    logFile.write(str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
                readError = True
        if not readError:

            parseError = False
            try:

                if forum == "BestCardingWorld":
                    rw = bestcardingworld_listing_parser(soup)
                elif forum == "CryptBB":
                    rw = cryptBB_listing_parser(soup)
                elif forum == "OnniForums":
                    rw = onniForums_listing_parser(soup)
                elif forum == "Altenens":
                    rw = altenens_listing_parser(soup)
                elif forum == "Procrax":
                    rw = procrax_listing_parser(soup)

            except:

                nError += 1
                print("There was a problem to parse the file " + line1 + " in the Listing section!")
                traceback.print_exc()
                if createLog:
                    logFile.write(
                        str(nError) + ". There was a problem to parse the file " + line1 + " in the Listing section.\n")
                parseError = True
            if not parseError:

                persistError = False
                moveError = False
                num_in_db = 0
                num_persisted_moved = 0

                for rec in rw:

                    rec = rec.split(',')

                    # print(rec)

                    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
                    key = u"Url:" + cleanLink(rec[6]) + "page1"
                    # print(key)

                    if key in detPage:

                        # Combining the information from Listing and Description Pages
                        rmm = detPage[key]['rmm']
                        rec = mergePages(rmm, rec)

                        # Append to the list the classification of the topic
                        # if isRussianForum(forum):
                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian')))
                        # else:
                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
                        rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))
                        # Persisting the information in the database
                        try:
                            persist_data(url, tuple(rec), cur)
                            con.commit()
                        except:

                            trace = traceback.format_exc()

                            if trace.find("already exists") == -1:
                                nError += 1
                                # detPage entries keep a 'files' list, so report the first (page1) file
                                print("There was a problem to persist the file " + detPage[key]['files'][0] + " in the database!")
                                if createLog:
                                    logFile.write(
                                        str(nError) + ". There was a problem to persist the file " + detPage[key]['files'][0] + " in the database.\n")
                                persistError = True

                            con.rollback()
                        if not persistError:

                            # move the description files to the completed ('Read') folder;
                            # note: line2 still points at the Description folder path from the loop above
                            for filename in detPage[key]['files']:

                                source = line2.replace(os.path.basename(line2), "") + filename
                                destination = line2.replace(os.path.basename(line2), "") + r'Read/'

                                try:
                                    shutil.move(source, destination)
                                    num_persisted_moved += 1
                                except:

                                    print("There was a problem to move the file " + filename + " in the Description section!")
                                    nError += 1
                                    if createLog:
                                        logFile.write(
                                            str(nError) + ". There was a problem to move the file " + filename + " in the Description section!\n")
                                    moveError = True

                    # if the associated description page is not read or not parsed
                    else:
                        # query database
                        # if the post already exists:
                        #     num_in_db += 1
                        pass
                # if the number of topics on the listing page is equal to
                # the number of merged, persisted, and moved topics plus
                # the number of topics already in the database
                if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db):

                    # move the listing file to the completed ('Read') folder
                    source = line1
                    destination = line1.replace(os.path.basename(line1), "") + r'Read/'

                    try:
                        shutil.move(source, destination)
                    except:

                        nError += 1
                        print("There was a problem to move the file " + line1 + " in the Listing section!")
                        if createLog:
                            logFile.write(str(nError) + ". There was a problem to move the file " + line1 + " in the Listing section!\n")
    if createLog:
        logFile.close()

    # end = time.time()
    # finalTime = float(end - ini)
    # print(forum + " Parsing Performed Successfully in %.2f" % finalTime + "!")

    input("Parsing the " + forum + " forum and data classification done successfully. Press ENTER to continue\n")
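
# new_parse is expected to be driven by Forums/Initialization/forums_mining.py once the HTML
# pages for a forum have been crawled; a minimal, hypothetical call (placeholder url) would be:
#
#     new_parse(forum='CryptBB', url='http://<forum-onion-address>/', createLog=True)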