"""reddit_data_preprocessing_for_upload.py.""" """ This program is a 1 year search of the Reddit source files downloaded from pushshift.io. author = 'Raymond Steding' copyright = 'copyright 2019, The WE1S Project' license = 'MIT' version = '1.0' email = 'raymond.steding.117@my.csun.edu' """ # Python Imports import glob import json import os import re import sys import os import calendar import datetime from dateutil.parser import parse from textblob import TextBlob # Configuration: This example uses the year 2013. month = 1 source_dir = '/Reddit/' input_dir = '/Reddit/input/' output_dir = '/Reddit/output/' UCSB_meta_file = '/Reddit/metadata/metadata' word_limit = 225 # Minimum number of words in the comment score_limit = 3 # Minimum Karma value of commenter while month <= 12: if month <= 9: grep_command = 'grep -i \'humanities\\|liberal arts\\|the arts\' /Reddit/RC_2013-0' + str(month) + ' > /Reddit/input/2013-0' + str(month) + '-humanities' os.system(grep_command) else: grep_command = 'grep -i \'humanities\\|liberal arts\\|the arts\' /Reddit/RC_2013-' + str(month) + ' > /Reddit/input/2013-' + str(month) + '-humanities' os.system(grep_command) month += 1 # Add a bracket at the beginning and end of a line and a comma # after each line, except the last, in order to create json files # out of the files returned by the grep command above. 
# Add a bracket at the beginning and end of each grep output file and a
# comma after each line, except the last, in order to turn the files
# returned by the grep commands above into valid JSON list files.
os.system('for file in ' + input_dir + '/*; do sed \'1s/^/[/;$!s/$/,/;$s/$/]/\' "$file" >> "$file.json"; done')

# Main routine: preprocess every monthly JSON file of Reddit comments.
# For each comment it computes average sentence polarity/subjectivity,
# writes the comment body to its own text file (subject to the word and
# Karma filters), and appends a one-line JSON metadata record to the
# shared metadata file.
for json_filename in glob.glob(os.path.join(input_dir, '*.json')):
    # Print the filename as output to audit program execution.
    print(json_filename)
    # Counter appended to each per-comment output filename.
    file_cnt = 0
    # Metadata file collects one JSON record per kept comment (append
    # mode so records from every month accumulate in one file).
    UCSB_meta_file_out = open(UCSB_meta_file, 'a', encoding='UTF-8')
    # Parse the whole month's comment list at once.  The original also
    # re-opened the file just to count lines (leaking a file handle);
    # iterating the parsed list directly is equivalent, since the grep
    # output had exactly one comment object per line.
    with open(json_filename, 'r') as f:
        doc = json.loads(f.read())
    for entry in doc:
        # Pull the metadata fields out of the comment object.
        subreddit = entry['subreddit']
        content = entry['body']
        score = str(entry['score'])
        author = entry['author']
        parent_id = entry['parent_id']
        Reddit_id = entry['id']
        subreddit_id = entry['subreddit_id']
        commenter = author
        # Not every dump revision carries these optional fields.
        try:
            permalink = str(entry['permalink'])
        except KeyError:
            permalink = 'NA'
        threadlink = 'NA'
        if permalink != 'NA':
            permalink = 'http://reddit.com' + permalink
            threadlink = permalink.rsplit('/', 2)[0]
        try:
            upvotes = str(entry['ups'])
        except KeyError:
            upvotes = '0'
        try:
            downvotes = str(entry['downs'])
        except KeyError:
            downvotes = '0'
        # Clean some of the known content problems (curly quotes and
        # Reddit '>' quote markers) before text processing.
        content = content.replace('‘', '\'')
        content = content.replace('>', '')
        content = content.replace('’', '\'')
        # Average per-sentence sentiment polarity (SP) and subjectivity
        # (SS), each rounded to three decimals.  BUG FIX: the original
        # divided by the sentence count unconditionally and crashed with
        # ZeroDivisionError on comments TextBlob split into no sentences;
        # it also contained no-op `sent_total == 0.000` comparisons where
        # assignments were intended.
        blob = TextBlob(content)
        sentences = blob.sentences
        if sentences:
            SP = round(sum(round(s.sentiment.polarity, 3)
                           for s in sentences) / len(sentences), 3)
            SS = round(sum(round(s.sentiment.subjectivity, 3)
                           for s in sentences) / len(sentences), 3)
        else:
            SP = 0.0
            SS = 0.0
        # Build the per-comment output filename and path; data_outfile is
        # the text file destined for upload as myzip.zip.
        base_name = os.path.basename(json_filename).replace('.json', '')
        file_title = base_name + '_' + str(file_cnt) + '.txt'
        data_outfile = output_dir + file_title
        # Word count of the cleaned comment body.
        words = len(content.split())
        word_lengths = str(words)
        # Normalize created_utc (an int, or a numeric string in some dump
        # revisions) to a YYYY-MM-DD date.  The original went through a
        # locale-dependent '%c' round-trip via dateutil.parse; formatting
        # the timestamp directly yields the same date without that
        # fragility.
        date = entry['created_utc']
        try:
            date = datetime.datetime.fromtimestamp(date).strftime('%Y-%m-%d')
        except TypeError:
            date = datetime.datetime.fromtimestamp(int(date)).strftime('%Y-%m-%d')
        # BUG FIX: the original concatenated raw field values into a JSON
        # string by hand, so any quote or backslash in an author name or
        # comment field produced an invalid JSON record.  json.dumps
        # escapes them correctly; compact separators keep the original
        # one-line `{"k":"v",...}` format, and ensure_ascii=False keeps
        # non-ASCII text as-is like the original concatenation did.
        UCSB_meta_data = json.dumps({
            'filename': file_title,
            'title': file_title,
            'author': author,
            'pub_date': date,
            'pub': subreddit,
            'Sentiment': str(SP),
            'Subjectivity': str(SS),
            'Score': score,
            'Upvotes': upvotes,
            'Downvotes': downvotes,
            'Wordcount': word_lengths,
            'Permalink': permalink,
            'Threadlink': threadlink,
            'Parent_id': parent_id,
            'Reddit_id': Reddit_id,
            'Subreddit_id': subreddit_id,
            'Commenter': commenter,
        }, separators=(',', ':'), ensure_ascii=False) + '\n'
        # Karma value as an integer (0 when unparseable).
        try:
            score = int(score)
        except ValueError:
            score = 0
        # Keep only sufficiently long, sufficiently upvoted comments.
        if words >= word_limit and score >= score_limit:
            # BUG FIX: a `with` block closes every output file; the
            # original closed only the last one after the loop and raised
            # NameError when no comment ever passed the filter.
            with open(data_outfile, 'w', encoding='UTF-8') as outfile:
                outfile.write(content)
            UCSB_meta_file_out.write(UCSB_meta_data)
        # Counter advances for every comment, kept or not (matches the
        # original numbering).
        file_cnt += 1
    UCSB_meta_file_out.close()

# Add the commas and brackets to the metadata file to turn it into a
# JSON list file, then remove the raw intermediate file.
os_command = ('for file in ' + UCSB_meta_file
              + '; do sed \'1s/^/[/;$!s/$/,/;$s/$/]/\' "$file" >> "$file.json"; done')
os.system(os_command)
os.remove(UCSB_meta_file)