Skip to content

Commit 50e1f7c

Browse files
committed
bugfinding with csv_to_json code
1 parent d0d7d53 commit 50e1f7c

File tree

6 files changed

+38398
-8681
lines changed

6 files changed

+38398
-8681
lines changed

.ipynb_checkpoints/Twitter NX User Graph-checkpoint.ipynb

Lines changed: 35 additions & 64 deletions
Large diffs are not rendered by default.

Twitter NX User Graph - convert from CSV.ipynb

Lines changed: 427 additions & 0 deletions
Large diffs are not rendered by default.

Twitter NX User Graph.ipynb

Lines changed: 23 additions & 39 deletions
Large diffs are not rendered by default.

csv_to_json.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import time
2+
import os
3+
import sys
4+
import json
5+
import argparse
6+
import preprocessor as p
7+
import csv
8+
9+
# Directory where converted JSON output is written (relative to the CWD).
RESULTS_DIR = os.path.join(os.path.curdir, "results/")
# Absolute directory the input CSV exports are read from.
DATA_DIR = "/home/chris/data"
11+
12+
# Process a csv single status
def csv_to_json(filename, output_file):
    """Convert a CSV export of tweets into a JSON dict keyed by tweet id.

    Reads ``DATA_DIR/filename`` (UTF-8 CSV with a header row; expected columns:
    link, date, time, author_id, followers, following, then one or more content
    columns) and writes ``RESULTS_DIR/converted_<output_file>.json``.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``DATA_DIR``.
    output_file : str
        Basename used for the output JSON file.
    """
    statuses = {}

    with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as r:
        reader = csv.reader(r, delimiter=",", quotechar='"')

        next(reader, None)  # skip the header row
        for row in reader:
            # BUG FIX: the original bound the third column to the name `time`,
            # shadowing the imported `time` module for the rest of the loop.
            # Underscore-prefixed names mark the columns this function ignores.
            # NOTE(review): rows with fewer than 6 columns still raise
            # ValueError here, as in the original — confirm inputs are clean.
            link, date, _time, author_id, _followers, _following, *content_cols = row

            # BUG FIX: join instead of quadratic `+=` concatenation. The
            # "" + f" {col}" form reproduces the original output exactly,
            # including the leading space before the first content column.
            content = "".join(f" {col}" for col in content_cols)

            # BUG FIX: the original `url, id_str = link.split("statuses/")`
            # raised ValueError for any link without exactly one "statuses/"
            # marker. partition never raises: id_str is "" when it's absent.
            _url, _, id_str = link.partition("statuses/")

            hashtags = []
            mentions = []
            url_strings = []

            parsed_tweet = p.parse(content)

            # preprocessor fields are None when absent, so iterating raises
            # TypeError; each list is guarded independently, as before.
            try:
                for hashtag in parsed_tweet.hashtags:
                    hashtags.append(hashtag.match)
            except TypeError:
                pass

            try:
                for mention in parsed_tweet.mentions:
                    mentions.append(mention.match)
            except TypeError:
                print(f"{author_id}: {content}")

            try:
                for url_string in parsed_tweet.urls:
                    url_strings.append(url_string.match)
            except TypeError:
                pass

            statuses[id_str] = {
                'id_str': id_str,
                'date': date,
                'text': content,

                # To do: tokenize tweet to get retweets
                'retweet_count': 0,
                'favorite_count': 0,
                'reply_to': 0,
                'coordinates': 0,
                'reply_to_tweet': 0,
                'user_screen_name': f"@{author_id}",
                'quoted_status': 0,
                'lang': 0,
                'entities': 0,
                'urls': url_strings,
                'hashtags': hashtags,
                'user_mentions': mentions,
                'user': author_id,
            }

    # BUG FIX: write with an explicit UTF-8 encoding — tweet text is read as
    # UTF-8, and the platform-default codec can fail on non-ASCII content.
    with open(os.path.join(RESULTS_DIR, "converted_" + output_file + ".json"),
              'w', encoding="utf-8") as f:
        f.write(json.dumps(statuses, indent=1))

    print("Success")
81+
82+
if __name__ == '__main__':
    # Command-line entry point: require an input CSV name and an output name.
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--filename", required=True, help="Enter file name")
    parser.add_argument("-o", "--output", required=True, help="Enter output file name")
    parsed = parser.parse_args()

    # Attribute access on the Namespace replaces the original vars() dict.
    csv_to_json(parsed.filename, parsed.output)

0 commit comments

Comments
 (0)