From 677bf00769bad1b27590fc163a7c0890a00a346b Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Wed, 25 Oct 2023 16:03:10 -0400 Subject: [PATCH 01/19] Group text runs into paragraphs --- lib/pdf/reader/page.rb | 112 +++++++++++++++++++++++++++++++++++++ lib/pdf/reader/text_run.rb | 11 ++++ spec/page_spec.rb | 28 ++++++++++ 3 files changed, 151 insertions(+) diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index d48768ad..64e34898 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -231,6 +231,45 @@ def rectangles } end + # returns all text on the page as an array of Paragraphs. + def paragraphs(opts = {}) + minimum_horizontal_overlap_percentage = opts.fetch(:minimum_horizontal_overlap_percentage, 0.80) + maximum_multiplied_leading = opts.fetch(:maximum_multiplied_leading, 1.40) + # maximum_allowed_font_difference = opts.fetch(:maximum_allowed_font_difference, 1.00) + + receiver = PageTextReceiver.new + walk(receiver) + runs = receiver.runs(opts) + + disjoint_set = DisjointSet.new + runs.each { |run| disjoint_set.add(run) } + + disjoint_set.each do |l0| + disjoint_set.each do |l1| + next if l0 == l1 + next if disjoint_set.find(l0) == disjoint_set.find(l1) + + overlap_percentage = l0.horizontal_overlap(l1) + leading = (l0.y - l1.y).abs / [l0.font_size, l1.font_size].min + + if overlap_percentage >= minimum_horizontal_overlap_percentage && leading <= maximum_multiplied_leading + disjoint_set.union(l0, l1) && next + end + end + end + + paragraphs = disjoint_set.sets.map do |set| + # remember, pdf page origin is bottom left corner + leftmost_x = set.map(&:x).min + topmost_y = set.map(&:y).max + text = set.map(&:text).join(' ') + + Paragraph.new(text, PDF::Reader::Point.new(leftmost_x, topmost_y)) + end + + paragraphs.map(&:text) + end + private def root @@ -314,3 +353,76 @@ def select_inheritable(obj) end end end + +Paragraph = Struct.new('Paragraph', :text, :origin) + +# In computer science, a disjoint-set data structure, also called a union–find data structure or merge–find set, +# is a data structure that stores a collection of disjoint (non-overlapping) sets. +class DisjointSet + include Enumerable + + def initialize + @parents = {} + @ranks = {} + end + + def contains(item) + @parents.key?(item) + end + + def each(&block) + if block_given? + @parents.each_key(&block) + else + to_enum(:each) + end + end + + def length + @parents.length + end + + def add(x) + @parents[x] = x + @ranks[x] = 0 + self + end + + def find(x) + return x if @parents[x] == x + + find(@parents[x]) + end + + def pop(x) + raise NotImplementedError, "Remove operation not supported" + end + + def sets + cluster_parents = {} + @parents.each_key do |x| + p = find(x) + cluster_parents[p] = [] unless cluster_parents.key?(p) + cluster_parents[p].push(x) + end + cluster_parents.values + end + + def union(x, y) + x_parent = find(x) + y_parent = find(y) + + return self if x_parent == y_parent + + if @ranks[x_parent] > @ranks[y_parent] + @parents[y_parent] = x_parent + elsif @ranks[y_parent] > @ranks[x_parent] + @parents[x_parent] = y_parent + else + @parents[y_parent] = x_parent + @ranks[x_parent] += 1 + end + + self + end +end diff --git a/lib/pdf/reader/text_run.rb b/lib/pdf/reader/text_run.rb index 9daa0b44..12bfa814 100644 --- a/lib/pdf/reader/text_run.rb +++ b/lib/pdf/reader/text_run.rb @@ -91,6 +91,17 @@ def intersection_area_percent(other_run) intersection_area.to_f / area end + # return what percentage of this text run is overlapped by another run horizontally + def horizontal_overlap(other_run) + # rectangles do not overlap (we are on the left side) + return 0 if [x, endx].max < [other_run.x, other_run.endx].min + # rectangles do not overlap (other_run is on the left side) + return 0 if [other_run.x, other_run.endx].max < [x, endx].min + a = [ [x, endx].min, [other_run.x, other_run.endx].min ].max + b = [ [x, endx].max, [other_run.x, other_run.endx].max ].min + return (a - b).abs + end + private def area diff --git a/spec/page_spec.rb b/spec/page_spec.rb index 1119ddf7..ec9ce340 100644 --- a/spec/page_spec.rb +++ b/spec/page_spec.rb @@ -110,6 +110,34 @@ end end + describe "#paragraphs page 1" do + let!(:page) { browser.page(1) } + + context "of cairo-basic.pdf" do + let!(:browser) { PDF::Reader.new(pdf_spec_file("cairo-basic")) } + + it "returns the text content" do + expect(page.paragraphs).to eql(["Hello James"]) + end + end + + context "of all_page_boxes_exist.pdf" do + let!(:browser) { PDF::Reader.new(pdf_spec_file("all_page_boxes_exist")) } + + it "returns headlines as their own paragraph" do + expect(page.paragraphs).to include("PDF Automation") + end + + it "returns actual paragraphs" do + expect(page.paragraphs).to include(<<~TEXT.strip.gsub(/\n/, " ")) + PDF page boxes include Media Box, Trim Box and Bleed Box. Imposition + in the Sheridan work flow requires a Trim Box and a Bleed Box where + bleeds are present with a consistent Media Box. + TEXT + end + end + end + describe "#walk" do context "with page 1 of cairo-basic.pdf" do From 23e0929c797572ccf2a1f3642a8eb211f2c3aa11 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Wed, 25 Oct 2023 16:03:10 -0400 Subject: [PATCH 02/19] Move DisjointSet to file --- lib/pdf/reader.rb | 1 + lib/pdf/reader/disjoint_set.rb | 79 ++++++++++++++++++++++++++++++++++ lib/pdf/reader/page.rb | 72 +------------------------------ 3 files changed, 81 insertions(+), 71 deletions(-) create mode 100644 lib/pdf/reader/disjoint_set.rb diff --git a/lib/pdf/reader.rb b/lib/pdf/reader.rb index 96bef30e..eb787cd6 100644 --- a/lib/pdf/reader.rb +++ b/lib/pdf/reader.rb @@ -284,6 +284,7 @@ def root require 'pdf/reader/bounding_rectangle_runs_filter' require 'pdf/reader/cid_widths' require 'pdf/reader/cmap' +require 'pdf/reader/disjoint_set' require 'pdf/reader/encoding' require 'pdf/reader/error' require 'pdf/reader/filter' diff --git a/lib/pdf/reader/disjoint_set.rb b/lib/pdf/reader/disjoint_set.rb new file mode 100644 index 00000000..f3279829 --- /dev/null +++ b/lib/pdf/reader/disjoint_set.rb @@ -0,0 +1,79 @@ +# coding: utf-8 +# typed: strict +# frozen_string_literal: true + +module PDF + class Reader + + # In computer science, a disjoint-set data structure, also called a union–find data structure or merge–find set, + # is a data structure that stores a collection of disjoint (non-overlapping) sets. + class DisjointSet + include Enumerable + + def initialize + @parents = {} + @ranks = {} + end + + def contains(item) + @parents.key?(item) + end + + def each(&block) + if block_given? + @parents.each_key(&block) + else + to_enum(:each) + end + end + + def length + @parents.length + end + + def add(x) + @parents[x] = x + @ranks[x] = 0 + self + end + + def find(x) + return x if @parents[x] == x + + find(@parents[x]) + end + + def pop(x) + raise NotImplementedError, "Remove operation not supported" + end + + def sets + cluster_parents = {} + @parents.each_key do |x| + p = find(x) + cluster_parents[p] = [] unless cluster_parents.key?(p) + cluster_parents[p].push(x) + end + cluster_parents.values + end + + def union(x, y) + x_parent = find(x) + y_parent = find(y) + + return self if x_parent == y_parent + + if @ranks[x_parent] > @ranks[y_parent] + @parents[y_parent] = x_parent + elsif @ranks[y_parent] > @ranks[x_parent] + @parents[x_parent] = y_parent + else + @parents[y_parent] = x_parent + @ranks[x_parent] += 1 + end + + self + end + end + end +end diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index 64e34898..71ab083a 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -241,7 +241,7 @@ def paragraphs(opts = {}) walk(receiver) runs = receiver.runs(opts) - disjoint_set = DisjointSet.new + disjoint_set = PDF::Reader::DisjointSet.new runs.each { |run| disjoint_set.add(run) } disjoint_set.each do |l0| @@ -356,73 +356,3 @@ def select_inheritable(obj) Paragraph = Struct.new('Paragraph', :text, :origin) -# In computer science, a disjoint-set data structure, also called a union–find data structure or merge–find set, -# is a data structure that stores a collection of disjoint (non-overlapping) sets. -class DisjointSet - include Enumerable - - def initialize - @parents = {} - @ranks = {} - end - - def contains(item) - @parents.key?(item) - end - - def each(&block) - if block_given? - @parents.each_key(&block) - else - to_enum(:each) - end - end - - def length - @parents.length - end - - def add(x) - @parents[x] = x - @ranks[x] = 0 - self - end - - def find(x) - return x if @parents[x] == x - - find(@parents[x]) - end - - def pop(x) - raise NotImplementedError, "Remove operation not supported" - end - - def sets - cluster_parents = {} - @parents.each_key do |x| - p = find(x) - cluster_parents[p] = [] unless cluster_parents.key?(p) - cluster_parents[p].push(x) - end - cluster_parents.values - end - - def union(x, y) - x_parent = find(x) - y_parent = find(y) - - return self if x_parent == y_parent - - if @ranks[x_parent] > @ranks[y_parent] - @parents[y_parent] = x_parent - elsif @ranks[y_parent] > @ranks[x_parent] - @parents[x_parent] = y_parent - else - @parents[y_parent] = x_parent - @ranks[x_parent] += 1 - end - - self - end -end From 8c7cd4be5f4feab85e70a032a9bf30717855dfe7 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Wed, 25 Oct 2023 16:03:10 -0400 Subject: [PATCH 03/19] Move paragraph struct up to where it's being used. --- lib/pdf/reader/page.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index 71ab083a..336c4d0a 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -232,6 +232,7 @@ def rectangles end # returns all text on the page as an array of Paragraphs. + Paragraph = Struct.new('Paragraph', :text, :origin) def paragraphs(opts = {}) minimum_horizontal_overlap_percentage = opts.fetch(:minimum_horizontal_overlap_percentage, 0.80) maximum_multiplied_leading = opts.fetch(:maximum_multiplied_leading, 1.40) @@ -353,6 +354,3 @@ def select_inheritable(obj) end end end - -Paragraph = Struct.new('Paragraph', :text, :origin) - From 7eb4142d7ebd72140b9e818427b9af27126fbc36 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Wed, 25 Oct 2023 16:03:10 -0400 Subject: [PATCH 04/19] Refactor --- lib/pdf/reader/page.rb | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index 336c4d0a..05dadde9 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -238,13 +238,11 @@ def paragraphs(opts = {}) maximum_multiplied_leading = opts.fetch(:maximum_multiplied_leading, 1.40) # maximum_allowed_font_difference = opts.fetch(:maximum_allowed_font_difference, 1.00) - receiver = PageTextReceiver.new - walk(receiver) - runs = receiver.runs(opts) - disjoint_set = PDF::Reader::DisjointSet.new - runs.each { |run| disjoint_set.add(run) } + runs(opts).each { |run| disjoint_set.add(run) } + # Build disjoint set in order to find all text runs that "overlap" by a + # certain percentage, so we can combine the right runs together. disjoint_set.each do |l0| disjoint_set.each do |l1| next if l0 == l1 @@ -253,9 +251,10 @@ def paragraphs(opts = {}) overlap_percentage = l0.horizontal_overlap(l1) leading = (l0.y - l1.y).abs / [l0.font_size, l1.font_size].min - if overlap_percentage >= minimum_horizontal_overlap_percentage && leading <= maximum_multiplied_leading - disjoint_set.union(l0, l1) && next - end + next unless overlap_percentage >= minimum_horizontal_overlap_percentage + next unless leading <= maximum_multiplied_leading + + disjoint_set.union(l0, l1) end end From 53191bcafff63b05b1655ef5d65c0a2b4a06cff8 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Wed, 25 Oct 2023 16:03:10 -0400 Subject: [PATCH 05/19] Strip whitespace before joining into paragraphs --- lib/pdf/reader/page.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index 05dadde9..ccdd3c80 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -262,7 +262,7 @@ def paragraphs(opts = {}) # remember, pdf page origin is bottom left corner leftmost_x = set.map(&:x).min topmost_y = set.map(&:y).max - text = set.map(&:text).join(' ') + text = set.map { |run| run.text.strip }.join(' ') Paragraph.new(text, PDF::Reader::Point.new(leftmost_x, topmost_y)) end From 913715d9298ce7ad1fe9430fda857e6aa6169cf4 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Wed, 25 Oct 2023 16:03:10 -0400 Subject: [PATCH 06/19] Use font size difference to keep headlines apart from paragraphs --- lib/pdf/reader/page.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index ccdd3c80..2a8e0f99 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -236,7 +236,7 @@ def rectangles def paragraphs(opts = {}) minimum_horizontal_overlap_percentage = opts.fetch(:minimum_horizontal_overlap_percentage, 0.80) maximum_multiplied_leading = opts.fetch(:maximum_multiplied_leading, 1.40) - # maximum_allowed_font_difference = opts.fetch(:maximum_allowed_font_difference, 1.00) + maximum_allowed_font_difference = opts.fetch(:maximum_allowed_font_difference, 1.00) disjoint_set = PDF::Reader::DisjointSet.new runs(opts).each { |run| disjoint_set.add(run) } @@ -253,6 +253,7 @@ def paragraphs(opts = {}) next unless overlap_percentage >= minimum_horizontal_overlap_percentage next unless leading <= maximum_multiplied_leading + next if (l0.font_size - l1.font_size).abs > maximum_allowed_font_difference disjoint_set.union(l0, l1) end From f209829b9e7eba93b0682bef86b8f5a66f5f325c Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Wed, 25 Oct 2023 16:03:10 -0400 Subject: [PATCH 07/19] Add test for multi-column layout --- spec/page_spec.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spec/page_spec.rb b/spec/page_spec.rb index ec9ce340..62b87d16 100644 --- a/spec/page_spec.rb +++ b/spec/page_spec.rb @@ -135,6 +135,15 @@ bleeds are present with a consistent Media Box. TEXT end + + it "returns paragraphs from multi-column layouts" do + puts page.paragraphs.inspect + expect(page.paragraphs).to include(<<~TEXT.strip.gsub(/\n/, " ")) + Enter your trim size of the Width and the Height. Elements that bleed + must extend .125" (1/8")beyond the project’s trim edge in your project + layout. + TEXT + end end end From de6266e8d3a361bb0dca80b177574cec628064be Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 11:32:02 -0400 Subject: [PATCH 08/19] Remove unimplemented method --- lib/pdf/reader/disjoint_set.rb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/pdf/reader/disjoint_set.rb b/lib/pdf/reader/disjoint_set.rb index f3279829..ebda4a5c 100644 --- a/lib/pdf/reader/disjoint_set.rb +++ b/lib/pdf/reader/disjoint_set.rb @@ -43,10 +43,6 @@ def find(x) find(@parents[x]) end - def pop(x) - raise NotImplementedError, "Remove operation not supported" - end - def sets cluster_parents = {} @parents.each_key do |x| From b3716790c336e8a66517b923666ae78c81c26ac9 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 11:32:40 -0400 Subject: [PATCH 09/19] First pass at implementing types for DisjointSet --- rbi/pdf-reader.rbi | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index 07233fc1..2c9ca03f 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -205,6 +205,32 @@ module PDF def bfrange_type_two(start_code, end_code, dst); end end + class DisjointSet + sig { void } + def initialize + @parents = T.let({}, T::Hash[T.untyped, T.untyped]) + @ranks = T.let({}, T::Hash[T.untyped, T.untyped]) + end + + sig { params(x: T.untyped).returns(T::Boolean) } + def contains(item); end + + sig { override.params(block: T.proc.params(arg0: Elem).returns(BasicObject)).returns(T.untyped) } + def each(&block); end + + sig { params(x: T.untyped).returns(PDF::Reader::DisjointSet) } + def add(x); end + + sig { params(x: T.proc.returns(T.type_parameter(:U))).returns(T.any(Elem, T.type_parameter(:U))) } + def find(x); end + + sig { returns(T::Array[T.untyped]) } + def sets; end + + sig { params(x: T.untyped, y: T.untyped).returns(PDF::Reader::DisjointSet) } + def union(x, y); end + end + class Encoding CONTROL_CHARS = T.let(T.unsafe(nil), T::Array[Integer]) UNKNOWN_CHAR = T.let(T.unsafe(nil), Integer) From df88380c01065572fa7290c10c9104bd33ff60ae Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 12:07:17 -0400 Subject: [PATCH 10/19] Move Paragraph to dedicated class --- lib/pdf/reader/page.rb | 1 - lib/pdf/reader/paragraph.rb | 16 ++++++++++++++++ rbi/pdf-reader.rbi | 8 ++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 lib/pdf/reader/paragraph.rb diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index 2a8e0f99..5015d8c3 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -232,7 +232,6 @@ def rectangles end # returns all text on the page as an array of Paragraphs. - Paragraph = Struct.new('Paragraph', :text, :origin) def paragraphs(opts = {}) minimum_horizontal_overlap_percentage = opts.fetch(:minimum_horizontal_overlap_percentage, 0.80) maximum_multiplied_leading = opts.fetch(:maximum_multiplied_leading, 1.40) diff --git a/lib/pdf/reader/paragraph.rb b/lib/pdf/reader/paragraph.rb new file mode 100644 index 00000000..ae7e42f0 --- /dev/null +++ b/lib/pdf/reader/paragraph.rb @@ -0,0 +1,16 @@ +# coding: utf-8 +# typed: strict +# frozen_string_literal: true + +module PDF + class Reader + class Paragraph + attr_reader :text, :origin + + def initialize(text, origin) + @text = text + @origin = origin + end + end + end +end diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index 2c9ca03f..1ac1a674 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -1224,6 +1224,14 @@ module PDF OPERATORS = T.let(T.unsafe(nil), T::Hash[String, Symbol]) end + class Paragraph + sig { params(text: String, origin: PDF::Reader::Point).void } + def initialize(text, origin) + @text = T.let(T.unsafe(nil), String) + @origin = T.let(T.unsafe(nil), PDF::Reader::Point) + end + end + class Parser sig { params(buffer: PDF::Reader::Buffer, objects: T.nilable(PDF::Reader::ObjectHash)).void } def initialize(buffer, objects=nil); end From 0071adbe615c065e13c6133fbf1e82fda230ca65 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 12:07:32 -0400 Subject: [PATCH 11/19] More type fixes --- rbi/pdf-reader.rbi | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index 1ac1a674..0664d675 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -212,12 +212,15 @@ module PDF @ranks = T.let({}, T::Hash[T.untyped, T.untyped]) end - sig { params(x: T.untyped).returns(T::Boolean) } + sig { params(item: T.untyped).returns(T::Boolean) } def contains(item); end sig { override.params(block: T.proc.params(arg0: Elem).returns(BasicObject)).returns(T.untyped) } def each(&block); end + sig { returns(Integer) } + def length; end + sig { params(x: T.untyped).returns(PDF::Reader::DisjointSet) } def add(x); end @@ -957,6 +960,9 @@ module PDF sig { returns(T::Hash[Symbol, PDF::Reader::Rectangle]) } def rectangles; end + sig { params(opts: T::Hash[Symbol, T.untyped]).returns(T::Array[String]) } + def paragraphs(opts = {}); end + sig { returns(T::Hash[Symbol, T.untyped]) } def root; end From 1a0b9934cb4eb6f76c9f4597dca84a0686e46864 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 12:29:36 -0400 Subject: [PATCH 12/19] Fix spec and paragraph init --- lib/pdf/reader.rb | 1 + lib/pdf/reader/page.rb | 2 +- spec/page_spec.rb | 5 ++--- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/pdf/reader.rb b/lib/pdf/reader.rb index eb787cd6..abd631b7 100644 --- a/lib/pdf/reader.rb +++ b/lib/pdf/reader.rb @@ -304,6 +304,7 @@ def root require 'pdf/reader/object_hash' require 'pdf/reader/object_stream' require 'pdf/reader/pages_strategy' +require 'pdf/reader/paragraph' require 'pdf/reader/parser' require 'pdf/reader/point' require 'pdf/reader/print_receiver' diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index 5015d8c3..23a16553 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -264,7 +264,7 @@ def paragraphs(opts = {}) topmost_y = set.map(&:y).max text = set.map { |run| run.text.strip }.join(' ') - Paragraph.new(text, PDF::Reader::Point.new(leftmost_x, topmost_y)) + PDF::Reader::Paragraph.new(text, PDF::Reader::Point.new(leftmost_x, topmost_y)) end paragraphs.map(&:text) diff --git a/spec/page_spec.rb b/spec/page_spec.rb index 62b87d16..50d60d6c 100644 --- a/spec/page_spec.rb +++ b/spec/page_spec.rb @@ -137,10 +137,9 @@ end it "returns paragraphs from multi-column layouts" do - puts page.paragraphs.inspect expect(page.paragraphs).to include(<<~TEXT.strip.gsub(/\n/, " ")) - Enter your trim size of the Width and the Height. Elements that bleed - must extend .125" (1/8")beyond the project’s trim edge in your project + QuarkXPress Enter your trim size of the Width and the Height. Elements that bleed + must extend .125" (1/8") beyond the project’s trim edge in your project layout. TEXT end From 01148838e2d6232efa1e14b8dfcb611e4d2fa6fc Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 12:40:50 -0400 Subject: [PATCH 13/19] More type fixes --- lib/pdf/reader/paragraph.rb | 2 +- rbi/pdf-reader.rbi | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/lib/pdf/reader/paragraph.rb b/lib/pdf/reader/paragraph.rb index ae7e42f0..acf4f7ce 100644 --- a/lib/pdf/reader/paragraph.rb +++ b/lib/pdf/reader/paragraph.rb @@ -1,5 +1,5 @@ # coding: utf-8 -# typed: strict +# typed: true # frozen_string_literal: true module PDF diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index 0664d675..f97bf4a5 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -208,14 +208,14 @@ module PDF class DisjointSet sig { void } def initialize - @parents = T.let({}, T::Hash[T.untyped, T.untyped]) - @ranks = T.let({}, T::Hash[T.untyped, T.untyped]) + @parents = T.let({}, T::Hash[T.anything, T.untyped]) + @ranks = T.let({}, T::Hash[T.anything, T.untyped]) end - sig { params(item: T.untyped).returns(T::Boolean) } + sig { params(item: T.anything).returns(T::Boolean) } def contains(item); end - sig { override.params(block: T.proc.params(arg0: Elem).returns(BasicObject)).returns(T.untyped) } + sig { override.params(block: T.nilable(T.proc.params(arg0: Enumerable::Elem).returns(BasicObject))).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) } def each(&block); end sig { returns(Integer) } @@ -224,7 +224,7 @@ module PDF sig { params(x: T.untyped).returns(PDF::Reader::DisjointSet) } def add(x); end - sig { params(x: T.proc.returns(T.type_parameter(:U))).returns(T.any(Elem, T.type_parameter(:U))) } + sig { type_parameters(:U).params(x: T.type_parameter(:U)).returns(T.type_parameter(:U)) } def find(x); end sig { returns(T::Array[T.untyped]) } @@ -1231,11 +1231,14 @@ module PDF end class Paragraph + sig { returns(String) } + attr_reader :text + + sig { returns(PDF::Reader::Point) } + attr_reader :origin + sig { params(text: String, origin: PDF::Reader::Point).void } - def initialize(text, origin) - @text = T.let(T.unsafe(nil), String) - @origin = T.let(T.unsafe(nil), PDF::Reader::Point) - end + def initialize(text, origin); end end class Parser From dde1100f2c72047929a40a2e19b85a69b0929210 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 12:42:56 -0400 Subject: [PATCH 14/19] Typecheck horizontal_overlap --- rbi/pdf-reader.rbi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index f97bf4a5..9b246721 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -1620,6 +1620,9 @@ module PDF sig { params(other_run: T.untyped).returns(Numeric) } def intersection_area_percent(other_run); end + sig { params(other_run: T.untyped).returns(Numeric) } + def horizontal_overlap(other_run); end + sig { returns(Numeric) } def area; end From 8e3913700c8c6354e816304a4ba6b39a87ac0e25 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 30 Oct 2023 15:05:47 -0400 Subject: [PATCH 15/19] Refactor, more type fixes --- lib/pdf/reader/disjoint_set.rb | 8 +++----- rbi/pdf-reader.rbi | 5 ++++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/pdf/reader/disjoint_set.rb b/lib/pdf/reader/disjoint_set.rb index ebda4a5c..ba0d8ea4 100644 --- a/lib/pdf/reader/disjoint_set.rb +++ b/lib/pdf/reader/disjoint_set.rb @@ -20,11 +20,9 @@ def contains(item) end def each(&block) - if block_given? - @parents.each_key(&block) - else - to_enum(:each) - end + return enum_for(:each) unless block_given? + + @parents.each_key(&block) end def length diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index 9b246721..7ad605c7 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -206,6 +206,8 @@ module PDF end class DisjointSet + include Enumerable + sig { void } def initialize @parents = T.let({}, T::Hash[T.anything, T.untyped]) @@ -215,7 +217,8 @@ module PDF sig { params(item: T.anything).returns(T::Boolean) } def contains(item); end - sig { override.params(block: T.nilable(T.proc.params(arg0: Enumerable::Elem).returns(BasicObject))).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) } + # sig { override.params(block: T.nilable(T.proc.params(arg0: Enumerable::Elem).returns(BasicObject))).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) } + sig { override.params(block: T.nilable).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) } def each(&block); end sig { returns(Integer) } From 7d7800648623dc733a8b507e4aa7bfa249b458b9 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Mon, 13 Nov 2023 16:53:36 -0500 Subject: [PATCH 16/19] Add disjoint set spec --- spec/disjoint_set_spec.rb | 97 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 spec/disjoint_set_spec.rb diff --git a/spec/disjoint_set_spec.rb b/spec/disjoint_set_spec.rb new file mode 100644 index 00000000..f2a320a0 --- /dev/null +++ b/spec/disjoint_set_spec.rb @@ -0,0 +1,97 @@ +# typed: false +# coding: utf-8 + +describe PDF::Reader::DisjointSet do + let(:set) { PDF::Reader::DisjointSet.new } + + describe "#add" do + it "adds a new item to the set" do + set.add(5) + expect(set.length).to eq(1) + expect(set.contains(5)).to be_truthy + end + end + + describe "#each" do + let(:set) do + set = PDF::Reader::DisjointSet.new + set.add(1) + set.add(2) + set.add(3) + set.union(1, 2) + end + + it "iterates over each item in the set (even if unions are created)" do + expect(set.each.to_a).to eq([1, 2, 3]) + end + + it "is used by Enumerable to provide iterative functionality like #map" do + result = set.map { |x| x.to_s } + expect(result).to eq(['1', '2', '3']) + end + end + + describe "#find" do + it "finds the parent of the item" do + set.add("parent") + set.add("child") + set.union("parent", "child") + expect(set.find("parent")).to eq("parent") + expect(set.find("child")).to eq("parent") + end + + it "returns the item if it is a parent" do + set.add("item") + expect(set.find("item")).to eq("item") + end + end + + describe "#sets" do + it "returns an array of arrays containing the sets" do + set.add("parent") + set.add("child") + set.add("unrelated") + set.union("parent", "child") + expect(set.sets).to eq([["parent", "child"], ["unrelated"]]) + end + end + + describe "#union" do + let(:set) do + set = PDF::Reader::DisjointSet.new + set.add("parent") + set.add("child") + set.add("grandchild") + set.add("unrelated") + end + + it "handles multiple unions" do + set.union("parent", "child") + set.union("child", "grandchild") + expect(set.sets).to eq([["parent", "child", "grandchild"], ["unrelated"]]) + end + + it "handles union params regardless of order" do + set.union("child", "parent") + set.union("grandchild", "child") + expect(set.sets).to eq([["parent", "child", "grandchild"], ["unrelated"]]) + end + + it "gracefully handles union of identical elements" do + set.union("child", "child") + expect(set.sets).to eq([["parent"], ["child"], ["grandchild"], ["unrelated"]]) + end + + it "handles joining multiple previous unions" do + set = PDF::Reader::DisjointSet.new + set.add("parent1") + set.add("child1") + set.add("parent2") + set.add("child2") + set.union("parent1", "child1") + set.union("parent2", "child2") + set.union("parent1", "parent2") + expect(set.sets).to eq([["parent1", "child1", "parent2", "child2"]]) + end + end +end From c26942c031857156004d00865555e63042cd7889 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Tue, 14 Nov 2023 10:51:45 -0500 Subject: [PATCH 17/19] Add missing comment on purpose of Paragraph class --- lib/pdf/reader/paragraph.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/pdf/reader/paragraph.rb b/lib/pdf/reader/paragraph.rb index acf4f7ce..c27aa250 100644 --- a/lib/pdf/reader/paragraph.rb +++ b/lib/pdf/reader/paragraph.rb @@ -4,6 +4,8 @@ module PDF class Reader + + # A simple class used by PDF::Reader::Page.paragraphs to represent a paragraph of text and its origin. class Paragraph attr_reader :text, :origin From 97344846f5a12d1b08cfca5f061900a2e72a3557 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Tue, 14 Nov 2023 10:52:03 -0500 Subject: [PATCH 18/19] Fix Elem type in DisjointSet --- rbi/pdf-reader.rbi | 1 + 1 file changed, 1 insertion(+) diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index 7ad605c7..b14bf9c2 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -207,6 +207,7 @@ module PDF class DisjointSet include Enumerable + Elem = type_member { {fixed: T.untyped} } sig { void } def initialize From 8c7e965c1b9121ddd17717d43352d3d1c4109436 Mon Sep 17 00:00:00 2001 From: Clinton Judy Date: Tue, 14 Nov 2023 10:54:17 -0500 Subject: [PATCH 19/19] Clean up old comment --- rbi/pdf-reader.rbi | 1 - 1 file changed, 1 deletion(-) diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index b14bf9c2..b0fa7356 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -218,7 +218,6 @@ module PDF sig { params(item: T.anything).returns(T::Boolean) } def contains(item); end - # sig { override.params(block: T.nilable(T.proc.params(arg0: Enumerable::Elem).returns(BasicObject))).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) } sig { override.params(block: T.nilable).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) } def each(&block); end