From ea0ad3fe9d2a06fe421f84e58f61f82340c019a8 Mon Sep 17 00:00:00 2001 From: Andrew Duthie Date: Fri, 22 Mar 2024 14:07:19 -0400 Subject: [PATCH 1/6] Change robots to deny by default with allowlist changelog: Bug Fixes, Robots, Improve consistency of robots.txt crawling directives --- public/robots.txt | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/public/robots.txt b/public/robots.txt index 18e580cf321..1a9d0e22679 100644 --- a/public/robots.txt +++ b/public/robots.txt @@ -1,5 +1,11 @@ User-agent: * -Disallow: /users/password/edit -Disallow: /sign_up/enter_password -Disallow: /sign_up/email/confirm -Disallow: /api/saml/ +Disallow: / +Allow: /$ +Allow: /es$ +Allow: /fr$ +Allow: /sign_up/enter_email$ +Allow: /es/sign_up/enter_email$ +Allow: /fr/sign_up/enter_email$ +Allow: /users/password/new$ +Allow: /es/users/password/new$ +Allow: /fr/users/password/new$ From 77e78489c872c8949abbdc14eb76dab3f0046924 Mon Sep 17 00:00:00 2001 From: Andrew Duthie Date: Fri, 22 Mar 2024 14:07:34 -0400 Subject: [PATCH 2/6] Remove redundant robots meta tag --- app/views/layouts/base.html.erb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/app/views/layouts/base.html.erb b/app/views/layouts/base.html.erb index dfb2599f44e..8af746f3dad 100644 --- a/app/views/layouts/base.html.erb +++ b/app/views/layouts/base.html.erb @@ -11,10 +11,6 @@ <% end %> - <% if session_with_trust? || FeatureManagement.disallow_all_web_crawlers? %> - - <% end %> - <%= title %> | <%= APP_NAME %> <%= javascript_tag(nonce: true) do %> From e5947752bf3c40b763d0e6ea9fa553e03592afac Mon Sep 17 00:00:00 2001 From: Andrew Duthie Date: Fri, 22 Mar 2024 14:10:14 -0400 Subject: [PATCH 3/6] Update specs --- spec/controllers/sign_up/passwords_controller_spec.rb | 7 ------- spec/controllers/users/reset_passwords_controller_spec.rb | 3 +-- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/spec/controllers/sign_up/passwords_controller_spec.rb b/spec/controllers/sign_up/passwords_controller_spec.rb index 55058d0673e..4dc62ad4ea1 100644 --- a/spec/controllers/sign_up/passwords_controller_spec.rb +++ b/spec/controllers/sign_up/passwords_controller_spec.rb @@ -169,13 +169,6 @@ describe '#new' do render_views - it 'instructs crawlers to not index this page' do - token = 'foo token' - create(:user, :unconfirmed, confirmation_token: token) - get :new, params: { confirmation_token: token } - - expect(response.body).to match('') - end it 'rejects when confirmation_token is invalid' do invalid_confirmation_sent_at = diff --git a/spec/controllers/users/reset_passwords_controller_spec.rb b/spec/controllers/users/reset_passwords_controller_spec.rb index 0e9fa4ab35b..cd9e95afc9f 100644 --- a/spec/controllers/users/reset_passwords_controller_spec.rb +++ b/spec/controllers/users/reset_passwords_controller_spec.rb @@ -141,7 +141,7 @@ allow(user).to receive(:email_addresses).and_return([email_address]) end - it 'displays the form to enter a new password and disallows indexing' do + it 'displays the form to enter a new password' do expect(email_address).to receive(:email).twice forbidden = instance_double(ForbiddenPasswords) @@ -156,7 +156,6 @@ expect(response).to render_template :edit expect(flash.keys).to be_empty - expect(response.body).to match('') end end end From 3ee1ae8bc3deb7e5859d5a53baf5c64538500b7c Mon Sep 17 00:00:00 2001 From: Andrew Duthie Date: Fri, 22 Mar 2024 14:35:03 -0400 Subject: [PATCH 4/6] Dynamically generate robots.txt --- app/controllers/robots_controller.rb | 25 +++++++++++++++++++++++++ config/routes.rb | 1 + public/robots.txt | 11 ----------- 3 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 app/controllers/robots_controller.rb delete mode 100644 public/robots.txt diff --git a/app/controllers/robots_controller.rb b/app/controllers/robots_controller.rb new file mode 100644 index 00000000000..55c7da0cc13 --- /dev/null +++ b/app/controllers/robots_controller.rb @@ -0,0 +1,25 @@ +class RobotsController < ApplicationController + ALLOWED_ROUTES = %i[ + new_user_session + forgot_password + sign_up_email + ].to_set.freeze + + def index + render plain: [ + 'User-agent: *', + 'Disallow: /', + *allowed_paths.map { |path| "Allow: #{path}$" }, + ].join("\n") + end + + private + + def allowed_paths + I18n.available_locales. + map { |locale| locale == I18n.default_locale ? nil : locale }. + flat_map do |locale| + ALLOWED_ROUTES.map { |route| route_for(route, only_path: true, locale:) } + end + end +end diff --git a/config/routes.rb b/config/routes.rb index 0e11ce3dcc0..6d02f4143b9 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -56,6 +56,7 @@ get '/openid_connect/logout' => 'openid_connect/logout#index' delete '/openid_connect/logout' => 'openid_connect/logout#delete' + get '/robots.txt' => 'robots#index' get '/no_js/detect.css' => 'no_js#index', as: :no_js_detect_css # i18n routes. Alphabetically sorted. diff --git a/public/robots.txt b/public/robots.txt deleted file mode 100644 index 1a9d0e22679..00000000000 --- a/public/robots.txt +++ /dev/null @@ -1,11 +0,0 @@ -User-agent: * -Disallow: / -Allow: /$ -Allow: /es$ -Allow: /fr$ -Allow: /sign_up/enter_email$ -Allow: /es/sign_up/enter_email$ -Allow: /fr/sign_up/enter_email$ -Allow: /users/password/new$ -Allow: /es/users/password/new$ -Allow: /fr/users/password/new$ From 13394eb6201b57b45bca4b2edae8e799194d4b18 Mon Sep 17 00:00:00 2001 From: Andrew Duthie Date: Mon, 25 Mar 2024 09:17:15 -0400 Subject: [PATCH 5/6] Remove unused disallow_all_web_crawlers config See: https://github.com/18F/identity-idp/pull/10292/files#r1537583288 --- config/application.yml.default | 2 -- lib/feature_management.rb | 4 ---- lib/identity_config.rb | 1 - spec/lib/feature_management_spec.rb | 14 -------------- 4 files changed, 21 deletions(-) diff --git a/config/application.yml.default b/config/application.yml.default index c681d2038ee..a58fcaa60dc 100644 --- a/config/application.yml.default +++ b/config/application.yml.default @@ -71,7 +71,6 @@ deleted_user_accounts_report_configs: '[]' development_mailer_deliver_method: letter_opener disable_email_sending: true disable_logout_get_request: true -disallow_all_web_crawlers: true disposable_email_services: '[]' doc_auth_attempt_window_in_minutes: 360 doc_capture_polling_enabled: true @@ -457,7 +456,6 @@ production: database_worker_jobs_password: '' disable_email_sending: false disable_logout_get_request: false - disallow_all_web_crawlers: false doc_auth_vendor: 'acuant' doc_auth_vendor_randomize: false doc_auth_vendor_randomize_percent: 0 diff --git a/lib/feature_management.rb b/lib/feature_management.rb index 9741cb7b37b..2dc4f82cf63 100644 --- a/lib/feature_management.rb +++ b/lib/feature_management.rb @@ -76,10 +76,6 @@ def self.enable_saml_cert_rotation? IdentityConfig.store.saml_secret_rotation_enabled end - def self.disallow_all_web_crawlers? - IdentityConfig.store.disallow_all_web_crawlers - end - def self.gpo_upload_enabled? # leaving the usps name for backwards compatibility IdentityConfig.store.usps_upload_enabled diff --git a/lib/identity_config.rb b/lib/identity_config.rb index 35e76c35f1d..56a1a39d6ca 100644 --- a/lib/identity_config.rb +++ b/lib/identity_config.rb @@ -171,7 +171,6 @@ def self.build_store(config_map) config.add(:development_mailer_deliver_method, type: :symbol, enum: [:file, :letter_opener]) config.add(:disable_email_sending, type: :boolean) config.add(:disable_logout_get_request, type: :boolean) - config.add(:disallow_all_web_crawlers, type: :boolean) config.add(:disposable_email_services, type: :json) config.add(:doc_auth_attempt_window_in_minutes, type: :integer) config.add(:doc_auth_check_failed_image_resubmission_enabled, type: :boolean) diff --git a/spec/lib/feature_management_spec.rb b/spec/lib/feature_management_spec.rb index 9eaff2be951..1542af8a2b1 100644 --- a/spec/lib/feature_management_spec.rb +++ b/spec/lib/feature_management_spec.rb @@ -270,20 +270,6 @@ end end - describe '#disallow_all_web_crawlers?' do - it 'returns true when IdentityConfig setting is true' do - allow(IdentityConfig.store).to receive(:disallow_all_web_crawlers) { true } - - expect(FeatureManagement.disallow_all_web_crawlers?).to eq(true) - end - - it 'returns false when IdentityConfig setting is false' do - allow(IdentityConfig.store).to receive(:disallow_all_web_crawlers) { false } - - expect(FeatureManagement.disallow_all_web_crawlers?).to eq(false) - end - end - describe '#identity_pki_local_dev?' do context 'when in development mode' do before(:each) do From 2b15d49b1b22422877a2bca77e5684d5f5b6cc4c Mon Sep 17 00:00:00 2001 From: Andrew Duthie Date: Mon, 25 Mar 2024 09:22:11 -0400 Subject: [PATCH 6/6] Add specs for RobotsController --- spec/controllers/robots_controller_spec.rb | 28 ++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 spec/controllers/robots_controller_spec.rb diff --git a/spec/controllers/robots_controller_spec.rb b/spec/controllers/robots_controller_spec.rb new file mode 100644 index 00000000000..fd899d9614f --- /dev/null +++ b/spec/controllers/robots_controller_spec.rb @@ -0,0 +1,28 @@ +require 'rails_helper' + +RSpec.describe RobotsController do + describe '#index' do + subject(:response) { get :index } + let(:lines) { response.body.lines(chomp: true) } + + it 'renders plaintext' do + expect(response.content_type.split(';').first).to eq('text/plain') + end + + it 'targets all crawlers' do + expect(lines).to include('User-agent: *') + end + + it 'denies all by default' do + expect(lines).to include('Disallow: /') + end + + it 'allows public routes' do + expect(lines).to include('Allow: /$') + end + + it 'allows localized version of public routes' do + expect(lines).to include('Allow: /es$') + end + end +end