diff --git a/app/controllers/api/v1/snapshots_controller.rb b/app/controllers/api/v1/snapshots_controller.rb index 5f51a263b..2da9a6813 100644 --- a/app/controllers/api/v1/snapshots_controller.rb +++ b/app/controllers/api/v1/snapshots_controller.rb @@ -30,6 +30,7 @@ def index def show @shallow = params[:shallow] || false + @with_docs = true respond_with @snapshot end @@ -41,8 +42,9 @@ def create @snapshot.scorer = @case.scorer @snapshot.try = @case.tries.first + puts "Okay, checking snapshot queries: #{@snapshot.snapshot_queries.length}" if @snapshot.save - + puts "Okay2, checking snapshot queries: #{@snapshot.snapshot_queries.length}" serialized_data = Marshal.dump(snapshot_params) puts "[SnapshotController] the size of the serialized data is #{number_to_human_size(serialized_data.bytesize)}" diff --git a/app/jobs/populate_snapshot_job.rb b/app/jobs/populate_snapshot_job.rb index 643b6f54b..6f5187d88 100644 --- a/app/jobs/populate_snapshot_job.rb +++ b/app/jobs/populate_snapshot_job.rb @@ -13,17 +13,16 @@ def perform snapshot serialized_data = Zlib::Inflate.inflate(compressed_data) params = Marshal.load(serialized_data) - puts "[PopulateSnapshotJob] I am going to populate the snapshot with #{params[:snapshot][:queries].size} queries" - - service = SnapshotManager.new(snapshot) + service = SnapshotManagerCopy.new(snapshot) snapshot_docs = params[:snapshot][:docs] snapshot_queries = params[:snapshot][:queries] - service.add_docs snapshot_docs, snapshot_queries if snapshot_docs + service.add_docs snapshot_docs, snapshot_queries + + snapshot.reload # this appears to be required or we duplicate the snapshot_queries! snapshot.snapshot_file.purge - snapshot.save end # rubocop:enable Security/MarshalLoad end diff --git a/app/services/snapshot_manager_copy.rb b/app/services/snapshot_manager_copy.rb new file mode 100644 index 000000000..fc86405db --- /dev/null +++ b/app/services/snapshot_manager_copy.rb @@ -0,0 +1,260 @@ +# frozen_string_literal: true + +# rubocop:disable Metrics/ClassLength +class SnapshotManagerCopy + attr_reader :logger, :options + + def initialize snapshot, opts = {} + default_options = { + format: :csv, + logger: Rails.logger, + show_progress: false, + } + + @options = default_options.merge(opts.deep_symbolize_keys) + @logger = @options[:logger] + @snapshot = snapshot + end + + def show_progress? + options[:show_progress] + end + + # + # Adds docs to a snapshot, assuming snapshot is being created from the + # app and thus all the queries already exist for the case + # (so no need to create them). + # + # @param data, hash + # @return self + # + # Example: + # + # manager = SnapshotManager.new snapshot + # data = { + # 123 => [ + # { id: "doc1", explain: "1" }, + # { id: "doc2", explain: "2" }, + # ], + # 456 => [ + # { id: "doc3", explain: "3" }, + # { id: "doc4", explain: "4" }, + # ] + # } + # manager.add_docs data + # + def add_docs docs, queries + queries_to_import = [] + + keys = docs.nil? ? [] : docs.keys + + # Start by adding queries to snapshot. + # First, setup all queries to be added in an array. + # block_with_progress_bar(keys.length) do |i| + keys.length.times.each do |i| + query_id = keys[i] + + snapshot_query = @snapshot.snapshot_queries.where(query_id: query_id).first_or_initialize + + # Quepid front end can send -- as no score. + queries[query_id]['score'] = nil if '--' == queries[query_id]['score'] + snapshot_query.score = queries[query_id][:score] + snapshot_query.all_rated = queries[query_id][:all_rated] + snapshot_query.number_of_results = queries[query_id][:number_of_results] + + queries_to_import << snapshot_query + end + + # Second, mass insert queries. + SnapshotQuery.import queries_to_import + # End of queries import. + + # Then import docs for the queries that were just created. + # This method is shared with the `import_queries` method + # which does the same thing with a slightly different set of data. + import_docs keys, docs + + self + end + + # + # Imports queries and docs to a snapshot. + # If the query does not already exists, it adds it to the case first, + # then it adds it to the snapshot. + # + # @param queries, hash + # @return self + # + # Example: + # + # manager = SnapshotManager.new snapshot + # data = { + # "dog" => { + # docs: [ + # { id: "doc1", explain: "1", position: 1 }, + # { id: "doc2", explain: "2", position: 2 }, + # ] + # }, + # "cat" => { + # docs: [ + # { id: "doc3", explain: "3", position: 2 }, + # { id: "doc4", explain: "4", position: 1 }, + # ] + # } + # } + # manager.import_queries data + # + # rubocop:disable Metrics/MethodLength + def import_queries queries + queries_to_import = [] + keys = queries.keys + + # Fetch all queries for the snapshot's case where the query text + # matches the keys in the hash supplied in the params. + queries_params = { + query_text: keys, + case_id: @snapshot.case_id, + } + indexed_queries = Query.where(queries_params) + .all + .index_by(&:query_text) + + # Start by adding queries to snapshot. + # First, setup all queries to be added in an array. + # print_step 'Importing queries' + # block_with_progress_bar(keys.length) do |i| + keys.length.times.each do |i| + query_text = keys[i] + query = fetch_or_create_query indexed_queries, query_text + + snapshot_query = @snapshot.snapshot_queries.where(query_id: query.id).first_or_initialize + + queries[query.id] = queries.delete(keys[i]) + + queries_to_import << snapshot_query + end + + # Second, mass insert queries. + SnapshotQuery.import queries_to_import + # End of queries import. + + # Updates keys after we switched them out from the text to the id + keys = queries.keys + data = {} + queries.each { |key, q| data[key] = q[:docs] || q['docs'] } + + # Then import docs for the queries that were just created. + import_docs keys, data + + self + end + # rubocop:enable Metrics/MethodLength + + def csv_to_queries_hash docs + # print_step 'Transforming csv into a queries hash' + + query_docs = {} + # block_with_progress_bar(docs.length) do |i| + docs.length.times.each do |i| + row = extract_doc_info docs[i] + query_docs[row[:query_text]] ||= { docs: [] } + query_docs[row[:query_text]][:docs] << row + end + + query_docs + end + + private + + # rubocop:disable Metrics/MethodLength + def setup_docs_for_query query, docs + results = [] + + return results if docs.blank? + return results if query.blank? + + docs = normalize_docs_array docs + docs = docs.sort { |d1, d2| d1[:position].to_i <=> d2[:position].to_i } + + docs.each_with_index do |doc, index| + doc_params = { + doc_id: doc[:id], + explain: doc[:explain], + position: doc[:position] || (index + 1), + rated_only: doc[:rated_only] || false, + fields: doc[:fields].blank? ? nil : doc[:fields].to_json, + } + + results << query.snapshot_docs.build(doc_params) + end + + results + end + # rubocop:enable Metrics/MethodLength + + def extract_doc_info row + case @options[:format] + when :csv + { + query_text: row[0], + id: row[1], + position: row[2], + } + when :hash + row.deep_symbolize_keys + else + row + end + end + + def normalize_docs_array docs + return [] if docs.blank? + + result = docs.map do |each| + each = each.to_unsafe_h if each.is_a?(ActionController::Parameters) + each = each.to_hash if each.is_a?(ActiveSupport::HashWithIndifferentAccess) + + each.symbolize_keys! if each.present? + end.compact + + result + end + + def import_docs keys, data + docs_to_import = [] + + indexed_snap_queries = @snapshot.snapshot_queries + .where(query_id: keys) + .all + .index_by { |q| q.query_id.to_s } + + # print_step 'Importing docs' + # block_with_progress_bar(keys.length) do |i| + keys.length.times.each do |i| + query_id = keys[i] + docs = data[keys[i]] + + snapshot_query = indexed_snap_queries[query_id.to_s] + query_docs = setup_docs_for_query snapshot_query, docs + + docs_to_import += query_docs + end + + SnapshotDoc.import docs_to_import + + self + end + + def fetch_or_create_query indexed_queries, query_text + if indexed_queries[query_text].present? + indexed_queries[query_text] + else + query_params = { + query_text: query_text, + case_id: @snapshot.case_id, + } + Query.create(query_params) + end + end +end +# rubocop:enable Metrics/ClassLength diff --git a/app/views/api/v1/snapshots/_snapshot.json.jbuilder b/app/views/api/v1/snapshots/_snapshot.json.jbuilder index f556ca5ac..0c261afc3 100644 --- a/app/views/api/v1/snapshots/_snapshot.json.jbuilder +++ b/app/views/api/v1/snapshots/_snapshot.json.jbuilder @@ -5,7 +5,7 @@ shallow ||= false json.id snapshot.id json.name snapshot.name json.time snapshot.created_at -json.has_snapshot_file = snapshot.snapshot_file.present? +json.has_snapshot_file snapshot.snapshot_file.present? unless shallow json.scorer snapshot.scorer, partial: 'api/v1/scorers/communal_scorer', as: :scorer diff --git a/test/controllers/api/v1/snapshots_controller_test.rb b/test/controllers/api/v1/snapshots_controller_test.rb index ff9d07150..84f21d5bb 100644 --- a/test/controllers/api/v1/snapshots_controller_test.rb +++ b/test/controllers/api/v1/snapshots_controller_test.rb @@ -24,12 +24,12 @@ class SnapshotsControllerTest < ActionController::TestCase name: 'New Snapshot', docs: { first_query.id => [ - { id: 'doc1', explain: 1 }, - { id: 'doc2', explain: 2 } + { id: 'doc1', explain: '1' }, + { id: 'doc2', explain: '2' } ], second_query.id => [ - { id: 'doc3', explain: 3 }, - { id: 'doc4', explain: 4 } + { id: 'doc3', explain: '3' }, + { id: 'doc4', explain: '4' } ], }, queries: { @@ -49,27 +49,37 @@ class SnapshotsControllerTest < ActionController::TestCase post :create, params: data.merge(case_id: acase.id) assert_response :ok - snapshot = response.parsed_body + snapshot_response = response.parsed_body - assert_not_nil snapshot['time'] + assert_not_nil snapshot_response['scorer'] + assert_not_nil snapshot_response['try'] + assert_not_nil snapshot_response['time'] + assert_not_nil snapshot_response['id'] + assert_equal snapshot_response['name'], data[:snapshot][:name] - assert_equal snapshot['name'], data[:snapshot][:name] - assert_equal snapshot['docs'].length, data[:snapshot][:docs].length + perform_enqueued_jobs - data_doc = data[:snapshot][:docs][first_query.id][0] - response_doc = snapshot['docs'][first_query.id.to_s][0] + snapshot = Snapshot.find(snapshot_response['id']) - assert_equal data_doc[:id], response_doc['id'] - assert_equal data_doc[:explain], response_doc['explain'] + assert_equal snapshot.snapshot_docs.length, data[:snapshot][:docs].values.flatten.count + data_doc = data[:snapshot][:docs][first_query.id][0] + snapshot_query = snapshot.snapshot_queries.where(query_id: first_query.id).first + + response_doc = snapshot_query.snapshot_docs[0] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + + snapshot_query = snapshot.snapshot_queries.where(query: second_query).first data_doc = data[:snapshot][:docs][second_query.id][0] - response_doc = snapshot['docs'][second_query.id.to_s][0] + response_doc = snapshot_query.snapshot_docs[0] - assert_equal data_doc[:id], response_doc['id'] - assert_equal data_doc[:explain], response_doc['explain'] + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain - assert_not_nil snapshot['scorer'] - assert_not_nil snapshot['try'] + assert_not_nil snapshot.scorer + assert_not_nil snapshot.try end end @@ -79,8 +89,8 @@ class SnapshotsControllerTest < ActionController::TestCase name: 'New Snapshot', docs: { first_query.id => [ - { id: 'doc1', explain: 1 }, - { id: 'doc2', explain: 2 } + { id: 'doc1', explain: '1' }, + { id: 'doc2', explain: '2' } ], # in Rails 4, we could do second_query.id => [] and getting the second_query in, # but in Rails 5, the second_query doesn't show up because the array that is empty @@ -105,20 +115,39 @@ class SnapshotsControllerTest < ActionController::TestCase assert_response :ok - snapshot = response.parsed_body + snapshot_response = response.parsed_body + + assert_equal data[:snapshot][:name], snapshot_response['name'] + assert_not_nil snapshot_response['id'] + + snapshot = Snapshot.find(snapshot_response['id']) + assert snapshot.snapshot_file.present? + assert_equal snapshot.snapshot_queries.length, 0 + + perform_enqueued_jobs - assert_equal data[:snapshot][:name], snapshot['name'] - assert_equal data[:snapshot][:docs].length, snapshot['docs'].length + snapshot = Snapshot.find(snapshot_response['id']) + assert_not snapshot.snapshot_file.present? + assert_equal snapshot.snapshot_queries.length, 2 + assert_equal data[:snapshot][:docs].length, snapshot.snapshot_docs.length - data_doc = data[:snapshot][:docs][first_query.id][0] - response_doc = snapshot['docs'][first_query.id.to_s][0] + data_doc = data[:snapshot][:docs][first_query.id][0] - assert_equal data_doc[:id], response_doc['id'] - assert_equal data_doc[:explain], response_doc['explain'] + snapshot_query = snapshot.snapshot_queries.where(query_id: first_query.id).first - response_docs = snapshot['docs'][second_query.id.to_s] + response_doc = snapshot_query.snapshot_docs[0] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + + snapshot_query = snapshot.snapshot_queries.where(query_id: second_query.id).first + response_docs = snapshot_query.snapshot_docs assert_empty response_docs + + perform_enqueued_jobs + + assert_equal snapshot.snapshot_queries.length, 2 end end @@ -216,8 +245,9 @@ class SnapshotsControllerTest < ActionController::TestCase let(:data) do { snapshot: { - name: 'New Snapshot', - docs: {}, + name: 'New Snapshot', + docs: {}, + queries: {}, }, } end diff --git a/test/services/snapshot_manager_copy_test.rb b/test/services/snapshot_manager_copy_test.rb new file mode 100644 index 000000000..84f649b14 --- /dev/null +++ b/test/services/snapshot_manager_copy_test.rb @@ -0,0 +1,264 @@ +# frozen_string_literal: true + +require 'test_helper' + +class SnapshotManagerCopyTest < ActiveSupport::TestCase + let(:snapshot) { snapshots(:empty_snapshot) } + let(:first_query) { queries(:a_query) } + let(:second_query) { queries(:b_query) } + + let(:service) { SnapshotManagerCopy.new(snapshot) } + + describe 'Adds docs to a snapshot' do + test 'adds snapshot to case' do + sample_fields = { title: 'some title', 'thumb:product_image': 'http://example.com/image.png' } + + data = { + docs: { + first_query.id => [ + { id: 'doc1', explain: '1', fields: sample_fields }, + { id: 'doc2', explain: '2', fields: sample_fields } + ], + second_query.id => [ + { id: 'doc3', explain: '3', fields: sample_fields }, + { id: 'doc4', explain: '4', fields: sample_fields } + ], + }, + queries: { + first_query.id => { + score: 0.87, + all_rated: true, + number_of_results: 42, + }, + second_query.id => { + score: 0.45, + all_rated: false, + number_of_results: nil, + }, + }, + } + + assert_difference 'snapshot.snapshot_queries.count', 2 do + service.add_docs data[:docs], data[:queries] + + # This is needed or else we get wrong numbers + snapshot.reload + snapshot_queries = snapshot.snapshot_queries + + assert_equal snapshot_queries.length, data[:docs].length + + first_snapshot_query = snapshot_queries.where(query_id: first_query.id).first + second_snapshot_query = snapshot_queries.where(query_id: second_query.id).first + + assert_not_nil first_snapshot_query + assert_equal first_snapshot_query.query_id, first_query.id + assert_equal first_snapshot_query.score, 0.87 + assert_equal true, first_snapshot_query.all_rated + assert_equal 42, first_snapshot_query.number_of_results + + assert_not_nil second_snapshot_query + assert_equal second_snapshot_query.query_id, second_query.id + assert_equal second_snapshot_query.score, 0.45 + assert_equal false, second_snapshot_query.all_rated + assert_nil second_snapshot_query.number_of_results + + data_doc = data[:docs][first_query.id][0] + response_doc = first_snapshot_query.snapshot_docs[0] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + assert_equal 1, response_doc.position + assert_equal sample_fields.to_json, response_doc.fields + + data_doc = data[:docs][second_query.id][0] + response_doc = second_snapshot_query.snapshot_docs[0] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + assert_equal 1, response_doc.position + assert_equal sample_fields.to_json, response_doc.fields + end + end + end + + describe 'Import queries' do + test 'creates queries if they do not already exist' do + data = { + 'dog' => { + docs: [ + { id: 'doc1', explain: '1', position: 1 }, + { id: 'doc2', explain: '2', position: 2 } + ], + }, + 'cat' => { + docs: [ + { id: 'doc3', explain: '3', position: 2 }, + { id: 'doc4', explain: '4', position: 1 } + ], + }, + } + + assert_difference 'Query.count', 2 do + service.import_queries data + + # This is needed or else we get wrong numbers + snapshot.reload + queries = snapshot.snapshot_queries + + assert_equal queries.length, data.length + + first_query = Query.where(query_text: 'dog', case_id: snapshot.case_id).first + second_query = Query.where(query_text: 'cat', case_id: snapshot.case_id).first + + first_snapshot_query = queries.where(query_id: first_query.id).first + second_snapshot_query = queries.where(query_id: second_query.id).first + + assert_not_nil first_snapshot_query + assert_equal first_snapshot_query.query_id, first_query.id + + assert_not_nil second_snapshot_query + assert_equal second_snapshot_query.query_id, second_query.id + + data_doc = data[first_query.id][:docs][0] + response_doc = first_snapshot_query.snapshot_docs[0] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + assert_equal data_doc[:position], response_doc.position + assert_equal 1, response_doc.position + + data_doc = data[second_query.id][:docs][0] + response_doc = second_snapshot_query.snapshot_docs[1] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + assert_equal data_doc[:position], response_doc.position + assert_equal 2, response_doc.position + end + end + + test 'does not create queries if they already exist' do + data = { + first_query.query_text => { + docs: [ + { id: 'doc1', explain: '1', position: 1 }, + { id: 'doc2', explain: '2', position: 2 } + ], + }, + second_query.query_text => { + docs: [ + { id: 'doc3', explain: '3', position: 2 }, + { id: 'doc4', explain: '4', position: 1 } + ], + }, + } + + assert_no_difference 'Query.count' do + service.import_queries data + + # This is needed or else we get wrong numbers + snapshot.reload + queries = snapshot.snapshot_queries + + assert_equal queries.length, data.length + + first_snapshot_query = queries.where(query_id: first_query.id).first + second_snapshot_query = queries.where(query_id: second_query.id).first + + assert_not_nil first_snapshot_query + assert_equal first_snapshot_query.query_id, first_query.id + + assert_not_nil second_snapshot_query + assert_equal second_snapshot_query.query_id, second_query.id + + data_doc = data[first_query.id][:docs][0] + response_doc = first_snapshot_query.snapshot_docs[0] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + assert_equal data_doc[:position], response_doc.position + assert_equal 1, response_doc.position + + data_doc = data[second_query.id][:docs][0] + response_doc = second_snapshot_query.snapshot_docs[1] + + assert_equal data_doc[:id], response_doc.doc_id + assert_equal data_doc[:explain], response_doc.explain + assert_equal data_doc[:position], response_doc.position + assert_equal 2, response_doc.position + end + end + + test 'handles query data with empty docs' do + data = { + first_query.query_text => { + docs: [], + }, + second_query.query_text => { + docs: [], + }, + } + + service.import_queries data + + # This is needed or else we get wrong numbers + snapshot.reload + queries = snapshot.snapshot_queries + + assert_equal queries.length, data.length + + first_snapshot_query = queries.where(query_id: first_query.id).first + second_snapshot_query = queries.where(query_id: second_query.id).first + + assert_not_nil first_snapshot_query + assert_not_nil second_snapshot_query + + data_doc = data[first_query.id][:docs] + response_doc = first_snapshot_query.snapshot_docs + + assert_equal data_doc.length, response_doc.length + + data_doc = data[second_query.id][:docs] + response_doc = second_snapshot_query.snapshot_docs + + assert_equal data_doc.length, response_doc.length + end + + test 'handles query data with array of nil docs' do + data = { + first_query.query_text => { + docs: [ nil, nil ], + }, + second_query.query_text => { + docs: [ + { id: 'doc3', explain: '3', position: 2 }, + { id: 'doc4', explain: '4', position: 1 } + ], + }, + } + + service.import_queries data + + # This is needed or else we get wrong numbers + snapshot.reload + queries = snapshot.snapshot_queries + + assert_equal queries.length, data.length + + first_snapshot_query = queries.where(query_id: first_query.id).first + second_snapshot_query = queries.where(query_id: second_query.id).first + + assert_not_nil first_snapshot_query + assert_not_nil second_snapshot_query + + response_doc = first_snapshot_query.snapshot_docs + + assert_equal 0, response_doc.length # coz they were all nils + + data_doc = data[second_query.id][:docs] + response_doc = second_snapshot_query.snapshot_docs + + assert_equal data_doc.length, response_doc.length + end + end +end