import pandas as pd
import requests
import tweepy
import os
import json
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', None) # Displays the full text in a column instead of truncating it
archive = pd.read_csv('twitter-archive-enhanced.csv')
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
r = requests.get(url)
with open('image_predictions.tsv', 'wb') as file:
    file.write(r.content)
predictions = pd.read_csv('image_predictions.tsv', sep='\t')
consumer_key = 'YOUR_CONSUMER_KEY_HERE'
consumer_secret = 'YOUR_CONSUMER_SECRET_HERE'
access_token = 'YOUR_ACCESS_TOKEN_HERE'
access_secret = 'YOUR_ACCESS_SECRET_HERE'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
#with open('tweet_json.txt', 'w') as file:
#    for tweet in archive.tweet_id:
#        try:
#            jsonstuff = api.get_status(tweet, tweet_mode='extended')._json
#            json.dump(jsonstuff, file)
#            file.write('\n')
#            print(tweet)
#        except tweepy.TweepError as err:
#            print(err.api_code)
wanted_keys = ['id', 'favorite_count', 'favorited', 'is_quote_status', 'retweet_count', 'retweeted']
tweet_json_list = []
with open('tweet_json.txt', 'r') as file:
    for line in file:
        json_line = json.loads(line)
        selected_keys = {k: v for k, v in json_line.items() if k in wanted_keys}
        tweet_json_list.append(selected_keys)
extended = pd.DataFrame(tweet_json_list)
archive.head()
archive.tail()
predictions.head()
predictions.tail()
extended.head()
extended.tail()
archive.info()
archive.describe()
Many null values in the previously mentioned columns; some nulls in expanded_urls.
Invalid data types: timestamp and retweeted_status_timestamp are stored as strings rather than datetimes.
Invalid data: the project brief specified only original tweets, so the 78 replies and 181 retweets (and any other non-original tweets) are invalid for this project.
Zero values appear in rating_numerator and rating_denominator.
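As a quick check on the counts noted above, the reply and retweet IDs and the zero ratings can be tallied directly (a small verification sketch using columns already in archive):
archive.in_reply_to_status_id.notnull().sum() # Number of replies in the archive
archive.retweeted_status_id.notnull().sum() # Number of retweets in the archive
(archive.rating_numerator == 0).sum(), (archive.rating_denominator == 0).sum() # Tweets with a 0 numerator or denominator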
predictions.info()
predictions.describe()
extended.info()
extended.describe()
archive.name.value_counts()
archive[archive['rating_denominator'] != 10]
predictions[predictions['jpg_url'].duplicated()]
predictions[predictions['jpg_url'] == 'https://pbs.twimg.com/media/DA7iHL5U0AA1OQo.jpg']
extended[extended['favorite_count'] == 0]
extended[extended['retweet_count'] == 0]
archive_clean = archive.copy()
predictions_clean = predictions.copy()
extended_clean = extended.copy()
full_archive = pd.merge(archive_clean, extended_clean, how='left', left_on='tweet_id', right_on='id').drop('id', axis=1)
full_archive.head()
full_archive.timestamp = pd.to_datetime(full_archive.timestamp, errors='coerce', infer_datetime_format=True)
full_archive.retweeted_status_timestamp = pd.to_datetime(full_archive.retweeted_status_timestamp, errors='coerce', infer_datetime_format=True)
full_archive.favorite_count = full_archive.favorite_count.fillna(0).astype(int)
full_archive.retweet_count = full_archive.retweet_count.fillna(0).astype(int)
full_archive.favorited = full_archive.favorited.astype(bool)
full_archive.retweeted = full_archive.retweeted.astype(bool)
full_archive.is_quote_status = full_archive.is_quote_status.astype(bool)
full_archive.info()
full_archive = full_archive[full_archive['in_reply_to_status_id'].isnull()]
full_archive = full_archive[full_archive['retweeted'] == False]
full_archive = full_archive[full_archive['is_quote_status'] == False]
full_archive = full_archive[full_archive['favorited'] == False]
full_archive = full_archive[full_archive['retweeted_status_id'].isnull()] # 'Retweeted' missed a few
full_archive.info()
full_archive = full_archive.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',
'retweeted_status_user_id', 'retweeted_status_timestamp', 'favorited',
'is_quote_status', 'retweeted'], axis=1)
full_archive.info()
full_archive[full_archive['favorite_count'] == 0]
full_archive[full_archive['retweet_count'] == 0] # It turns out these were taken care of by deleting retweets/replies!
full_archive[full_archive['rating_denominator'] == 0] # This must have also been taken care of by the replies/retweets purge.
full_archive[full_archive['rating_numerator'] == 0] # This one turns out to be a joke and can be deleted as well.
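Before dropping it, the text of that tweet can be inspected to confirm it really is a joke rating (index 315 is the row found by the query above):
full_archive.loc[315, 'text']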
full_archive = full_archive.drop(315, axis=0)
full_archive[full_archive['rating_numerator'] == 0]
full_archive[full_archive['rating_denominator'] != 10]
full_archive.loc[1068, ['rating_numerator', 'rating_denominator']] = 14, 10
full_archive.loc[1165, ['rating_numerator', 'rating_denominator']] = 13, 10
full_archive.loc[1202, ['rating_numerator', 'rating_denominator']] = 11, 10
full_archive.loc[1662, ['rating_numerator', 'rating_denominator']] = 10, 10
full_archive.loc[2335, ['rating_numerator', 'rating_denominator']] = 9, 10
full_archive[full_archive['rating_denominator'] != 10]
def name_replace(df):
    '''Takes a dataframe, finds rows where a lowercase word ('a', 'quite', etc.) was recorded as the
    dog's name but the real name appears in the text after the word 'named', and corrects the name
    field accordingly. Returns the corrected dataframe.'''
    index = df[(df['name'].str.islower()) & (df['text'].str.contains('named'))].index.values
    for i in index:
        new_name = df.loc[i, 'text'].split('named ')[1].split(' ')[0]
        if new_name.endswith('.'):
            new_name = new_name[:-1] # Strip a trailing period if the name ends the sentence
        df.loc[i, 'name'] = new_name
    return df
full_archive = name_replace(full_archive)
full_archive[(full_archive['name'].str.islower()) & (full_archive['text'].str.contains('name'))] # Two orphan cases to fix manually; the rest can be changed to None.
full_archive.loc[852, 'name'] = 'Zoey'
full_archive.loc[2287, 'name'] = 'Daryl'
full_archive.loc[full_archive['name'].str.islower(), 'name'] = None
full_archive.name.value_counts()
predictions_clean['p1'] = predictions_clean['p1'].str.lower()
predictions_clean['p2'] = predictions_clean['p2'].str.lower()
predictions_clean['p3'] = predictions_clean['p3'].str.lower()
predictions_clean.head()
urls_only = full_archive[['tweet_id', 'expanded_urls']]
predictions_clean = pd.merge(predictions_clean, urls_only, how='left', left_on='tweet_id', right_on='tweet_id')
predictions_w_url = predictions_clean[predictions_clean['expanded_urls'].notnull()]
predictions_w_url[predictions_w_url['jpg_url'].duplicated()]
predictions_w_url.info()
predictions_clean[~predictions_clean['jpg_url'].duplicated()].sample(25) # Trying to figure out what the non-duplicate pictures with NaN URLs are; a lot appear not to be from dogrates.
predictions_clean = predictions_w_url.copy().drop('expanded_urls', axis=1)
predictions_clean[predictions_clean['jpg_url'].duplicated()]
tweets = full_archive[['tweet_id', 'timestamp', 'text', 'expanded_urls', 'favorite_count', 'retweet_count']]
dogs = full_archive[['tweet_id', 'name', 'rating_numerator', 'rating_denominator', 'doggo', 'floofer', 'pupper', 'puppo']]
tweets.head()
dogs.head()
def stage_sum(df):
    '''
    Takes in a dataframe, turns all string-literal 'None' values into 0s, creates a new dummy column called
    'multiple', then counts how many of the four stage columns are set for each row and marks rows with more
    than one stage as 'multiple'. Returns the new dataframe. Please feel free to suggest a better fix; I can't
    think of one that doesn't require some kind of None or Boolean, and replace() doesn't work with the null None.
    '''
    df = df.replace('None', 0)
    df['multiple'] = 0
    for i in df.index:
        doggo = bool(df.loc[i, 'doggo'])
        floofer = bool(df.loc[i, 'floofer'])
        pupper = bool(df.loc[i, 'pupper'])
        puppo = bool(df.loc[i, 'puppo'])
        stage_sum = doggo + floofer + pupper + puppo
        if stage_sum > 1:
            df.loc[i, 'multiple'] = 'multiple'
    df = df.replace(0, 'None') # If anyone has advice on how to avoid this double replacement call in particular, let me know!
    return df
dogs = stage_sum(dogs)
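As one possible answer to the docstring's request for a better fix, here is a vectorized sketch (left commented out because stage_sum has already been applied) that avoids the row loop and the double replace() call; it assumes the four stage columns hold either 'None' or the stage name, as in the original archive:
#stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']
#stage_count = (dogs[stage_cols] != 'None').sum(axis=1) # How many stages are set per row
#dogs['multiple'] = np.where(stage_count > 1, 'multiple', 'None')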
dogs = pd.melt(dogs, id_vars = ['tweet_id', 'name', 'rating_numerator', 'rating_denominator'], var_name="variable", value_name="stage").drop('variable', axis=1)
dogs['stage'] = dogs['stage'].astype('category')
dogs['stage'] = dogs['stage'].cat.set_categories(['None', 'doggo', 'floofer', 'pupper', 'puppo', 'multiple'], ordered=True)
dogs = dogs.sort_values(by='stage')
dogs = dogs.drop_duplicates(subset='tweet_id', keep='last') # Needed to prevent each tweet/dog having 2-4 redundant rows!
dogs.stage.value_counts()
tweets.info()
dogs.info()
predictions_clean.info()
tweets.to_csv('twitter_archive_master.csv', index=False)
dogs.to_csv('dogs_archive_master.csv', index=False)
predictions_clean.to_csv('predictions_archive_master.csv', index=False)
predictions_clean.describe() # The highest confidence levels are mostly found in p1, so I'll focus on that.
predictions_clean.p1.value_counts()
predictions_clean[predictions_clean['p1_conf'] > 0.75].p1.value_counts()
137/1969 # Ratio of golden retrievers to all items in the full p1 column
78/690 # Ratio of golden retrievers to all items where p1_conf > .75 -- definitely higher
94/1969 # Ratio of labrador retrievers to all items in the full p1 column
40/690 # Ratio of labrador retrievers to all items where p1_conf > .75 -- slightly higher
88/1969 # Ratio of pembrokes to all items in the full p1 column
47/690 # Ratio of pembrokes to all items where p1_conf > .75 -- also slightly higher
# I also want to know about the ratio of non-dogs to dogs at high confidence levels:
predictions_clean.p1_dog.value_counts()
predictions_clean[predictions_clean['p1_conf'] > 0.75].p1_dog.value_counts()
507/1969 # Ratio of non-dogs to all items in the full p1 column
152/690 # Ratio of non-dogs to all items where p1_conf > .75 -- I thought it would be higher!
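The same proportions can be pulled programmatically instead of typing the ratios by hand; value_counts(normalize=True) gives them directly. A sketch, assuming the lowercased breed labels are 'golden_retriever', 'labrador_retriever', and 'pembroke':
predictions_clean.p1.value_counts(normalize=True)[['golden_retriever', 'labrador_retriever', 'pembroke']] # Share of each breed among all p1 predictions
predictions_clean[predictions_clean['p1_conf'] > 0.75].p1.value_counts(normalize=True)[['golden_retriever', 'labrador_retriever', 'pembroke']] # Share where p1_conf > .75
predictions_clean.p1_dog.value_counts(normalize=True) # Share of dog vs. non-dog predictions overall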
What can I take away from this? Assuming confidence is a reasonable proxy for accuracy, the neural network seems especially good at predicting golden retrievers, fairly good at other common breeds, and not as good as I expected at picking out things that aren't dogs.
dogs.groupby('stage').rating_numerator.mean()
This one-line groupby tells us that, of the dogs that have "stages," puppos tend to rate the highest, while, oddly enough, puppers rate the lowest.
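Because some stages have far fewer dogs than others, it's worth checking the group sizes alongside the means before leaning on that conclusion:
dogs.groupby('stage').rating_numerator.agg(['mean', 'count'])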
tweets.head()
tweets_by_time = tweets[['timestamp', 'favorite_count', 'retweet_count']]
tweets_by_time = tweets_by_time.set_index('timestamp')
tweets_by_time.head()
tweets_by_year = tweets_by_time.resample('A').median()
tweets_by_year
tweets_by_quarter = tweets_by_time.resample('Q').median()
tweets_by_quarter
tweets_by_month = tweets_by_time.resample('MS').median()
tweets_by_month
tweets_by_week = tweets_by_time.resample('W').median()
tweets_by_week
tweets_by_day = tweets_by_time.resample('D').median()
tweets_by_day
The above code shows the median values of favorite_count and retweet_count by year, quarter, month, week, and day, so we can see how these counts build over time. The day-to-day medians are noisy, but growth becomes more consistent when aggregated by week, and the month, quarter, and year aggregates all show very consistent growth.
We can also see that favorites don't just outnumber retweets; their rate also grows faster. For example, looking at the quarterly aggregate, there are just under three times as many favorites as retweets in the first quarter, but over five times as many in the last. We can visualize this data below:
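That favorite-to-retweet ratio can be computed directly from the quarterly table instead of being read off by eye (a quick sketch):
tweets_by_quarter.favorite_count / tweets_by_quarter.retweet_count # Median favorites divided by median retweets, per quarter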
tweets_by_year.plot();
tweets_by_quarter.plot();
tweets_by_month.plot();
tweets_by_week.plot();
tweets_by_day.plot();
Our tweets_by_week plot is arguably the most useful of these, so I'll use it and pretty it up a bit.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
plt.plot(tweets_by_week.index, tweets_by_week.favorite_count, linewidth=2.0, label='Favorites');
plt.plot(tweets_by_week.index, tweets_by_week.retweet_count, linewidth=2.0, label='Retweets');
ax.spines['top'].set_visible(False); # Getting rid of top spine
ax.spines['right'].set_visible(False); # Getting rid of right spine
ax.set_xlim('2015-11-15', '2017-08-15'); # Getting rid of whitespace
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tick_params(axis='both', which='both', bottom=False, top=False,
labelbottom=True, left=False, right=False, labelleft=True); # Hiding unnecessary tick marks
plt.ylabel('Tweets', fontsize=16);
plt.title('Favorites and Retweets by Week', fontsize=22);
plt.legend();
plt.grid(True, alpha=0.25); # Showing a light gray grid to help visual alignment
plt.savefig('tweets_by_week.png', bbox_inches='tight')
plt.show()