"""reddit_data_preprocessing_for_upload.py.""" """ This program is a 1 year search of the Reddit source files downloaded from pushshift.io. author = 'Raymond Steding' copyright = 'copyright 2019, The WE1S Project' license = 'MIT' version = '1.0' email = 'raymond.steding.117@my.csun.edu' """ # Python Imports import glob import json import os import re import sys import os import calendar import datetime from dateutil.parser import parse from textblob import TextBlob # Configuration: This example uses the year 2013. month = 1 source_dir = '/Reddit/' input_dir = '/Reddit/input/' output_dir = '/Reddit/output/' UCSB_meta_file = '/Reddit/metadata/metadata' word_limit = 225 # Minimum number of words in the comment score_limit = 3 # Minimum Karma value of commenter while month <= 12: if month <= 9: grep_command = 'grep -i \'humanities\\|liberal arts\\|the arts\' /Reddit/RC_2013-0' + str(month) + ' > /Reddit/input/2013-0' + str(month) + '-humanities' os.system(grep_command) else: grep_command = 'grep -i \'humanities\\|liberal arts\\|the arts\' /Reddit/RC_2013-' + str(month) + ' > /Reddit/input/2013-' + str(month) + '-humanities' os.system(grep_command) month += 1 # Add a bracket at the beginning and end of a line and a comma # after each line, except the last, in order to create json files # out of the files returned by the grep command above. 
# Add a bracket at the beginning and end of each grep output file and a
# comma after each line, except the last, in order to turn the files
# returned by the grep commands above into valid JSON list files.
os.system('for file in ' + input_dir + '/*; do sed \'1s/^/[/;$!s/$/,/;$s/$/]/\' "$file" >> "$file.json"; done')

# Main routine: preprocess every monthly JSON file of Reddit comments.
# For each comment it computes average sentence polarity/subjectivity,
# writes the comment body to its own text file (subject to the word and
# Karma filters), and appends a one-line JSON metadata record to the
# shared metadata file.
for json_filename in glob.glob(os.path.join(input_dir, '*.json')):
    # Print the filename as output to audit program execution.
    print(json_filename)
    # Counter appended to each per-comment output filename.
    file_cnt = 0
    # Metadata file collects one JSON record per kept comment (append
    # mode so records from every month accumulate in one file).
    UCSB_meta_file_out = open(UCSB_meta_file, 'a', encoding='UTF-8')
    # Parse the whole month's comment list at once.  The original also
    # re-opened the file just to count lines (leaking a file handle);
    # iterating the parsed list directly is equivalent, since the grep
    # output had exactly one comment object per line.
    with open(json_filename, 'r') as f:
        doc = json.loads(f.read())
    for entry in doc:
        # Pull the metadata fields out of the comment object.
        subreddit = entry['subreddit']
        content = entry['body']
        score = str(entry['score'])
        author = entry['author']
        parent_id = entry['parent_id']
        Reddit_id = entry['id']
        subreddit_id = entry['subreddit_id']
        commenter = author
        # Not every dump revision carries these optional fields.
        try:
            permalink = str(entry['permalink'])
        except KeyError:
            permalink = 'NA'
        threadlink = 'NA'
        if permalink != 'NA':
            permalink = 'http://reddit.com' + permalink
            threadlink = permalink.rsplit('/', 2)[0]
        try:
            upvotes = str(entry['ups'])
        except KeyError:
            upvotes = '0'
        try:
            downvotes = str(entry['downs'])
        except KeyError:
            downvotes = '0'
        # Clean some of the known content problems (curly quotes and
        # Reddit '>' quote markers) before text processing.
        content = content.replace('‘', '\'')
        content = content.replace('>', '')
        content = content.replace('’', '\'')
        # Average per-sentence sentiment polarity (SP) and subjectivity
        # (SS), each rounded to three decimals.  BUG FIX: the original
        # divided by the sentence count unconditionally and crashed with
        # ZeroDivisionError on comments TextBlob split into no sentences;
        # it also contained no-op `sent_total == 0.000` comparisons where
        # assignments were intended.
        blob = TextBlob(content)
        sentences = blob.sentences
        if sentences:
            SP = round(sum(round(s.sentiment.polarity, 3)
                           for s in sentences) / len(sentences), 3)
            SS = round(sum(round(s.sentiment.subjectivity, 3)
                           for s in sentences) / len(sentences), 3)
        else:
            SP = 0.0
            SS = 0.0
        # Build the per-comment output filename and path; data_outfile is
        # the text file destined for upload as myzip.zip.
        base_name = os.path.basename(json_filename).replace('.json', '')
        file_title = base_name + '_' + str(file_cnt) + '.txt'
        data_outfile = output_dir + file_title
        # Word count of the cleaned comment body.
        words = len(content.split())
        word_lengths = str(words)
        # Normalize created_utc (an int, or a numeric string in some dump
        # revisions) to a YYYY-MM-DD date.  The original went through a
        # locale-dependent '%c' round-trip via dateutil.parse; formatting
        # the timestamp directly yields the same date without that
        # fragility.
        date = entry['created_utc']
        try:
            date = datetime.datetime.fromtimestamp(date).strftime('%Y-%m-%d')
        except TypeError:
            date = datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d')
        # BUG FIX: the original concatenated raw field values into a JSON
        # string by hand, so any quote or backslash in an author name or
        # comment field produced an invalid JSON record.  json.dumps
        # escapes them correctly; compact separators keep the original
        # one-line `{"k":"v",...}` format, and ensure_ascii=False keeps
        # non-ASCII text as-is like the original concatenation did.
        UCSB_meta_data = json.dumps({
            'filename': file_title,
            'title': file_title,
            'author': author,
            'pub_date': date,
            'pub': subreddit,
            'Sentiment': str(SP),
            'Subjectivity': str(SS),
            'Score': score,
            'Upvotes': upvotes,
            'Downvotes': downvotes,
            'Wordcount': word_lengths,
            'Permalink': permalink,
            'Threadlink': threadlink,
            'Parent_id': parent_id,
            'Reddit_id': Reddit_id,
            'Subreddit_id': subreddit_id,
            'Commenter': commenter,
        }, separators=(',', ':'), ensure_ascii=False) + '\n'
        # Karma value as an integer (0 when unparseable).
        try:
            score = int(score)
        except ValueError:
            score = 0
        # Keep only sufficiently long, sufficiently upvoted comments.
        if words >= word_limit and score >= score_limit:
            # BUG FIX: a `with` block closes every output file; the
            # original closed only the last one after the loop and raised
            # NameError when no comment ever passed the filter.
            with open(data_outfile, 'w', encoding='UTF-8') as outfile:
                outfile.write(content)
            UCSB_meta_file_out.write(UCSB_meta_data)
        # Counter advances for every comment, kept or not (matches the
        # original numbering).
        file_cnt += 1
    UCSB_meta_file_out.close()

# Add the commas and brackets to the metadata file to turn it into a
# JSON list file, then remove the raw intermediate file.
os_command = ('for file in ' + UCSB_meta_file
              + '; do sed \'1s/^/[/;$!s/$/,/;$s/$/]/\' "$file" >> "$file.json"; done')
os.system(os_command)
os.remove(UCSB_meta_file)