# mongo_ptt.py
# -*- coding: utf-8 -*-
# This file shows how to access the PTT corpus stored in MongoDB on the
# 140.112.147.132 server.
import json
import os
import re
import urllib
import urllib.parse

import pymongo
from pymongo import MongoClient
# ----- connection settings ----- #
# Every setting can be overridden through an environment variable. The
# fallbacks are the values this script has always used, so running it with
# nothing configured behaves exactly as before.
MONGO_HOST = os.environ.get('PTT_MONGO_HOST', '140.112.147.132')
MONGO_USER = os.environ.get('PTT_MONGO_USER', 'achiii')
# SECURITY: the fallback below is a real credential committed to source
# control. Rotate it, supply the new value via PTT_MONGO_PASSWORD, and then
# delete this literal.
MONGO_PASSWORD = os.environ.get(
    'PTT_MONGO_PASSWORD', 'gjoKClmg8eQDF4pKeVXMkTnX7wL/9MVilkavArDouNA=')

# User name and password must be percent-escaped before being embedded in a
# MongoDB URI; characters such as '/', '=', ':' and '@' would otherwise be
# parsed as URI syntax.
username = urllib.parse.quote_plus(MONGO_USER)
password = urllib.parse.quote_plus(MONGO_PASSWORD)
client = MongoClient(
    'mongodb://{}:{}@{}'.format(username, password, MONGO_HOST))

# select the database that holds the PTT corpus
db = client['PTT']
# ----- list out all the board names in PTT corpus ----- #
# ['AllTogether', 'Baseball', 'Boy-Girl'...]
# Each board is stored as one collection. Database.collection_names() was
# deprecated in PyMongo 3.7 and removed in 4.0; list_collection_names() is
# its replacement.
board_list = db.list_collection_names()

# ----- access a board ----- #
collect = db['Baseball']

# ----- list out only the first 10 posts of this board ----- #
# limit(10) caps the cursor at ten documents.
for post in collect.find().limit(10):
    print(post)  # every post is a dictionary

# ----- fetch the first post once and reuse it ----- #
# find_one() returns a single document, or None when the collection is
# empty, so the three lookups below share one query.
first_post = collect.find_one()
if first_post is None:
    print('No posts found in this board.')
else:
    # ----- the keys within the first post ----- #
    # ['push_num', 'URL', 'boo_num', 'content_seg', 'comments', 'arrow_num',
    #  'post_time', 'author', '_id', 'content', 'comment_differenceValue',
    #  'title']
    print(first_post.keys())
    # ----- accessing the element of the first post ----- #
    print(first_post['content'])    # the content of the post
    print(first_post['post_time'])  # the posting time of the post