From 5a3cffcc8d0eeae0405d554d557b72c92149fddc Mon Sep 17 00:00:00 2001 From: RaInG0ld <20622378+rockrockyy@users.noreply.github.com> Date: Sat, 30 Jun 2018 12:30:31 +0800 Subject: [PATCH 1/4] call get_collection with item --- scrapy_mongodb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_mongodb.py b/scrapy_mongodb.py index 9c9a80a..5b4b534 100644 --- a/scrapy_mongodb.py +++ b/scrapy_mongodb.py @@ -219,7 +219,7 @@ def insert_item(self, item, spider): if self.config['append_timestamp']: item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()} - collection_name, collection = self.get_collection(spider.name) + collection_name, collection = self.get_collection(spider.name, item) if self.config['unique_key'] is None: try: @@ -253,7 +253,7 @@ def insert_item(self, item, spider): return item - def get_collection(self, name): + def get_collection(self, name, item): if self.config['separate_collections']: collection = self.collections.get(name) collection_name = name From 81012d7e5dad915ccce586628372d7bc47b5597b Mon Sep 17 00:00:00 2001 From: Patrick Yi Date: Fri, 6 Jul 2018 15:17:25 +0000 Subject: [PATCH 2/4] allow item to override collection name --- README.md | 1 + scrapy_mongodb.py | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 7e422bd..61b667e 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ MONGODB_SEPARATE_COLLECTIONS = True | --- | --- | --- | --- | | `MONGODB_DATABASE` | scrapy-mongodb | No | Database to use. Does not need to exist. | | `MONGODB_COLLECTION` | items | No | Collection within the database to use. Does not need to exist. | +| `MONGODB_ITEM_COLLECTION` | collection | No | Collection name to use if an item has this key. Does not need to exist. | | `MONGODB_URI` | mongodb://localhost:27017 | No | URI to the MongoDB instance or replica sets you want to connect to. It must start with `mongodb://` (see more in the [MongoDB docs][1]). E.g.: `mongodb://user:pass@host:port`, `mongodb://user:pass@host:port,host2:port2` | | `MONGODB_UNIQUE_KEY` | None | No | If you want to have a unique key in the database, enter the key name here. `scrapy-mongodb` will ensure the key is properly indexed. | | `MONGODB_BUFFER_DATA` | None | No | To ease the load on MongoDB, set this option to the number of items you want to buffer in the client before sending them to database. Setting a `MONGODB_UNIQUE_KEY` together with `MONGODB_BUFFER_DATA` is not supported. | diff --git a/scrapy_mongodb.py b/scrapy_mongodb.py index 5b4b534..c4323fa 100644 --- a/scrapy_mongodb.py +++ b/scrapy_mongodb.py @@ -32,6 +32,7 @@ class MongoDBPipeline(BaseItemExporter): 'write_concern': 0, 'database': 'scrapy-mongodb', 'collection': 'items', + 'item_collection': 'collection', 'separate_collections': False, 'replica_set': None, 'unique_key': None, @@ -82,7 +83,7 @@ def open_spider(self, spider): # Set up the database self.database = connection[self.config['database']] - self.collections = {'default': self.database[self.config['collection']]} + self.collections = {} self.logger.info(u'Connected to MongoDB {0}, using "{1}"'.format( self.config['uri'], @@ -140,6 +141,7 @@ def configure(self): ('write_concern', 'MONGODB_REPLICA_SET_W'), ('database', 'MONGODB_DATABASE'), ('collection', 'MONGODB_COLLECTION'), + ('item_collection', 'MONGODB_ITEM_COLLECTION'), ('separate_collections', 'MONGODB_SEPARATE_COLLECTIONS'), ('replica_set', 'MONGODB_REPLICA_SET'), ('unique_key', 'MONGODB_UNIQUE_KEY'), @@ -254,16 +256,19 @@ def insert_item(self, item, spider): return item def get_collection(self, name, item): - if self.config['separate_collections']: - collection = self.collections.get(name) + + collection_name = self.config['collection'] + + if self.config['item_collection'] and self.config['item_collection'] in item: + collection_name = item[self.config['item_collection']] + elif self.config['separate_collections']: collection_name = name - if not collection: - collection = self.database[name] - self.collections[name] = collection - else: - collection = self.collections.get('default') - collection_name = self.config['collection'] + collection = self.collections.get(collection_name) + + if not collection: + collection = self.database[collection_name] + self.collections[collection_name] = collection # Ensure unique index if self.config['unique_key']: From 508c76d14aa14935c8fb1fc2e281c45e47581c61 Mon Sep 17 00:00:00 2001 From: Patrick Yi Date: Fri, 6 Jul 2018 16:06:20 +0000 Subject: [PATCH 3/4] set None as the default value for item_collection --- scrapy_mongodb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_mongodb.py b/scrapy_mongodb.py index c4323fa..7de69e8 100644 --- a/scrapy_mongodb.py +++ b/scrapy_mongodb.py @@ -32,7 +32,7 @@ class MongoDBPipeline(BaseItemExporter): 'write_concern': 0, 'database': 'scrapy-mongodb', 'collection': 'items', - 'item_collection': 'collection', + 'item_collection': None, 'separate_collections': False, 'replica_set': None, 'unique_key': None, From 5d068db303b83c7eb8a070830fd36c8ee8678d37 Mon Sep 17 00:00:00 2001 From: Patrick Yi Date: Fri, 6 Jul 2018 16:08:19 +0000 Subject: [PATCH 4/4] update default value of item_collection in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 61b667e..1735316 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ MONGODB_SEPARATE_COLLECTIONS = True | --- | --- | --- | --- | | `MONGODB_DATABASE` | scrapy-mongodb | No | Database to use. Does not need to exist. | | `MONGODB_COLLECTION` | items | No | Collection within the database to use. Does not need to exist. | -| `MONGODB_ITEM_COLLECTION` | collection | No | Collection name to use if an item has this key. Does not need to exist. | +| `MONGODB_ITEM_COLLECTION` | None | No | Collection name to use if an item has this key. Does not need to exist. | | `MONGODB_URI` | mongodb://localhost:27017 | No | URI to the MongoDB instance or replica sets you want to connect to. It must start with `mongodb://` (see more in the [MongoDB docs][1]). E.g.: `mongodb://user:pass@host:port`, `mongodb://user:pass@host:port,host2:port2` | | `MONGODB_UNIQUE_KEY` | None | No | If you want to have a unique key in the database, enter the key name here. `scrapy-mongodb` will ensure the key is properly indexed. | | `MONGODB_BUFFER_DATA` | None | No | To ease the load on MongoDB, set this option to the number of items you want to buffer in the client before sending them to database. Setting a `MONGODB_UNIQUE_KEY` together with `MONGODB_BUFFER_DATA` is not supported. |