Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ MONGODB_SEPARATE_COLLECTIONS = True
| --- | --- | --- | --- |
| `MONGODB_DATABASE` | scrapy-mongodb | No | Database to use. Does not need to exist. |
| `MONGODB_COLLECTION` | items | No | Collection within the database to use. Does not need to exist. |
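| `MONGODB_ITEM_COLLECTION` | None | No | Name of an item key whose value selects the collection for that item. If an item contains this key, the item is stored in the collection named by the key's value (this takes precedence over `MONGODB_SEPARATE_COLLECTIONS`); items without the key are stored as usual. The collection does not need to exist. |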
| `MONGODB_URI` | mongodb://localhost:27017 | No | URI to the MongoDB instance or replica sets you want to connect to. It must start with `mongodb://` (see more in the [MongoDB docs][1]). E.g.: `mongodb://user:pass@host:port`, `mongodb://user:pass@host:port,host2:port2` |
| `MONGODB_UNIQUE_KEY` | None | No | If you want to have a unique key in the database, enter the key name here. `scrapy-mongodb` will ensure the key is properly indexed. |
| `MONGODB_BUFFER_DATA` | None | No | To ease the load on MongoDB, set this option to the number of items you want to buffer in the client before sending them to the database. Setting a `MONGODB_UNIQUE_KEY` together with `MONGODB_BUFFER_DATA` is not supported. |
Expand Down
27 changes: 16 additions & 11 deletions scrapy_mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class MongoDBPipeline(BaseItemExporter):
'write_concern': 0,
'database': 'scrapy-mongodb',
'collection': 'items',
'item_collection': None,
'separate_collections': False,
'replica_set': None,
'unique_key': None,
Expand Down Expand Up @@ -82,7 +83,7 @@ def open_spider(self, spider):

# Set up the database
self.database = connection[self.config['database']]
self.collections = {'default': self.database[self.config['collection']]}
self.collections = {}

self.logger.info(u'Connected to MongoDB {0}, using "{1}"'.format(
self.config['uri'],
Expand Down Expand Up @@ -140,6 +141,7 @@ def configure(self):
('write_concern', 'MONGODB_REPLICA_SET_W'),
('database', 'MONGODB_DATABASE'),
('collection', 'MONGODB_COLLECTION'),
('item_collection', 'MONGODB_ITEM_COLLECTION'),
('separate_collections', 'MONGODB_SEPARATE_COLLECTIONS'),
('replica_set', 'MONGODB_REPLICA_SET'),
('unique_key', 'MONGODB_UNIQUE_KEY'),
Expand Down Expand Up @@ -219,7 +221,7 @@ def insert_item(self, item, spider):
if self.config['append_timestamp']:
item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()}

collection_name, collection = self.get_collection(spider.name)
collection_name, collection = self.get_collection(spider.name, item)

if self.config['unique_key'] is None:
try:
Expand Down Expand Up @@ -253,17 +255,20 @@ def insert_item(self, item, spider):

return item

def get_collection(self, name):
if self.config['separate_collections']:
collection = self.collections.get(name)
def get_collection(self, name, item):

collection_name = self.config['collection']

if self.config['item_collection'] and self.config['item_collection'] in item:
collection_name = item[self.config['item_collection']]
elif self.config['separate_collections']:
collection_name = name

if not collection:
collection = self.database[name]
self.collections[name] = collection
else:
collection = self.collections.get('default')
collection_name = self.config['collection']
collection = self.collections.get(collection_name)

if not collection:
collection = self.database[collection_name]
self.collections[collection_name] = collection

# Ensure unique index
if self.config['unique_key']:
Expand Down