diff --git a/README.md b/README.md index f341cf7..3446c5d 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ The loosely specified `diff` file format is widely used for transmitting differences of line-based information. The motivating example is [`opam`](https://opam.ocaml.org), which is able to validate updates being -cryptographically signed (e.g. [conex](https://github.com/hannesm/conex)) by +cryptographically signed (e.g. [conex](https://github.com/robur-coop/conex)) by providing a unified diff. The [test-based infered specification](https://www.artima.com/weblogs/viewpost.jsp?thread=164293) @@ -50,11 +50,16 @@ or deleted, and if the chunk size is omitted (including the comma), it is set to 1. NB from practical experiments, only "+1" and "-1" are supported. ```OCaml +type git_ext = + | Rename_only of string * string + | Delete_only + | Create_only + type operation = | Edit of string * string | Delete of string | Create of string - | Rename_only of string * string + | Git_ext of (string * string * git_ext) type hunk (* positions and contents *) @@ -73,8 +78,7 @@ from old and new file contents is also provided. The function `patch` assumes that the patch applies cleanly, and does not check this assumption. Exceptions may be raised if this assumption is violated. -The git diff format allows further features, such as file permissions, and also -a "copy from / to" header, which I was unable to spot in the wild. +The git diff format allows further features, such as file permissions. ## Installation diff --git a/src/dune b/src/dune index 73c8ae5..eb1c38c 100644 --- a/src/dune +++ b/src/dune @@ -2,7 +2,7 @@ (name patch) (synopsis "Patch purely in OCaml") (public_name patch) - (modules patch lib fname)) + (modules patch lib fname rope)) (executable (name patch_command) diff --git a/src/lib.ml b/src/lib.ml index e9f5b7e..95b496f 100644 --- a/src/lib.ml +++ b/src/lib.ml @@ -57,12 +57,4 @@ module List = struct | [] -> invalid_arg "List.last" | [x] -> x | _::xs -> last xs - - let rev_cut idx l = - let rec aux acc idx = function - | l when idx = 0 -> (acc, l) - | [] -> invalid_arg "List.cut" - | x::xs -> aux (x :: acc) (idx - 1) xs - in - aux [] idx l end diff --git a/src/lib.mli b/src/lib.mli index 81da0d8..fe6d438 100644 --- a/src/lib.mli +++ b/src/lib.mli @@ -9,5 +9,4 @@ end module List : sig val last : 'a list -> 'a - val rev_cut : int -> 'a list -> 'a list * 'a list end diff --git a/src/patch.ml b/src/patch.ml index 4388c64..e5c66e2 100644 --- a/src/patch.ml +++ b/src/patch.ml @@ -36,27 +36,30 @@ let pp_hunk ~mine_no_nl ~their_no_nl ppf hunk = hunk.mine_start hunk.mine_len hunk.their_start hunk.their_len (unified_diff ~mine_no_nl ~their_no_nl hunk) -let rec apply_hunk ~cleanly ~fuzz (last_matched_line, offset, lines) ({mine_start; mine_len; mine; their_start = _; their_len; their} as hunk) = +let rec apply_hunk ~cleanly ~fuzz (last_matched_line, offset, rope) ({mine_start; mine_len; mine; their_start = _; their_len; their} as hunk) = let mine_start = mine_start + offset in let patch_match ~search_offset = let mine_start = mine_start + search_offset in - let rev_prefix, rest = Lib.List.rev_cut (Stdlib.max 0 (mine_start - 1)) lines in - let rev_actual_mine, suffix = Lib.List.rev_cut mine_len rest in - let actual_mine = List.rev rev_actual_mine in - if actual_mine <> (mine : string list) then - invalid_arg "unequal mine"; - (* TODO: should we check their_len against List.length their? *) + let off_mine = Stdlib.max 0 (mine_start - 1) in + let prefix = Rope.chop rope off_mine in + let actual_mine = Rope.chop rope ~off:off_mine mine_len in + let off = off_mine + mine_len in + let suffix = Rope.shift rope off in + if not (Rope.equal_to_string_list actual_mine mine) then + invalid_arg "unequal mine"; + let theirs = + let nl = Rope.last_is_nl actual_mine in + Rope.of_strings their nl + in (mine_start + mine_len, offset + (their_len - mine_len), - (* TODO: Replace rev_append (rev ...) by the tail-rec when patch - requires OCaml >= 4.14 *) - List.rev_append rev_prefix (List.rev_append (List.rev their) suffix)) + Rope.concat prefix (Rope.concat theirs suffix)) in try patch_match ~search_offset:0 with Invalid_argument _ -> if cleanly then invalid_arg "apply_hunk" else - let max_pos_offset = Stdlib.max 0 (List.length lines - Stdlib.max 0 (mine_start - 1) - mine_len) in + let max_pos_offset = Stdlib.max 0 (Rope.length rope - Stdlib.max 0 (mine_start - 1) - mine_len) in let max_neg_offset = mine_start - last_matched_line in let rec locate search_offset = let aux search_offset max_offset = @@ -100,7 +103,7 @@ let rec apply_hunk ~cleanly ~fuzz (last_matched_line, offset, lines) ({mine_star else if mine_len = (hunk.mine_len : int) && their_len = (hunk.their_len : int) then invalid_arg "apply_hunk: could not apply fuzz" else - apply_hunk ~cleanly ~fuzz:(fuzz + 1) (last_matched_line, offset, lines) hunk + apply_hunk ~cleanly ~fuzz:(fuzz + 1) (last_matched_line, offset, rope) hunk else invalid_arg "apply_hunk" else @@ -476,9 +479,9 @@ let patch ~cleanly filedata diff = | _ -> assert false end | Edit _ -> - let old = match filedata with None -> [] | Some x -> to_lines x in - let _, _, lines = List.fold_left (apply_hunk ~cleanly ~fuzz:0) (0, 0, old) diff.hunks in - let lines = String.concat "\n" lines in + let old = match filedata with None -> Rope.empty | Some x -> Rope.of_string x in + let _, _, rope = List.fold_left (apply_hunk ~cleanly ~fuzz:0) (0, 0, old) diff.hunks in + let lines = Rope.to_string rope in let lines = match diff.mine_no_nl, diff.their_no_nl with | false, true -> diff --git a/src/rope.ml b/src/rope.ml new file mode 100644 index 0000000..8e4cfdc --- /dev/null +++ b/src/rope.ml @@ -0,0 +1,117 @@ +(* Originally from https://github.com/robur-coop/utcp, written by + Calascibetta Romain *) + +(* A rope data structure where each node is a line *) + +type t = + | Str of string array * bool * int * int + | App of t * t * int + +let length = function + | Str (_, _, len, _) -> len + | App (_, _, len) -> len + +(* keep compatibility with 4.08 *) +let min_int (a : int) (b : int) = min a b +external unsafe_blit_string : string -> int -> bytes -> int -> int -> unit + = "caml_blit_string" [@@noalloc] + +let append t1 t2 = + App (t1, t2, length t1 + length t2) + +let empty = Str (Array.make 0 "", true, 0, 0) + +let rec unsafe_sub t start stop = + if start = 0 && Int.equal stop (length t) then + t + else if Int.equal start stop then + empty + else match t with + | Str (data, nl, len, off) -> + assert (stop <= (len : int)); + Str (data, nl, stop - start, off + start) + | App (l, r, _) -> + let len = length l in + if stop <= (len : int) then unsafe_sub l start stop + else if start >= (len : int) then unsafe_sub r (start - len) (stop - len) + else append (unsafe_sub l start len) (unsafe_sub r 0 (stop - len)) + +let chop t ?(off = 0) len = + if len < 0 || len > (length t - off : int) then + invalid_arg "Rope.chop"; + if len = 0 then empty else unsafe_sub t off (off + len) + +let shift t len = + if len < 0 then + invalid_arg "Rope.shift"; + if len = 0 then + t + else + let max = length t in + let len = min_int max len in + let l = len + (max - len) in + unsafe_sub t len l + +let rec last_is_nl = function + | Str (a, nl, len, off) -> if Int.equal (Array.length a - off) len then nl else true + | App (_, r, _) -> last_is_nl r + +let rec byte_length = function + | Str (s, _, len, off) as a -> + let sum = ref 0 in + for idx = off to len + off - 1 do + let data = Array.unsafe_get s idx in + sum := !sum + String.length data + 1 + done; + !sum - if last_is_nl a then 0 else 1 + | App (l, r, _) -> byte_length l + byte_length r + +let rec into_bytes buf dst_off = function + | Str (s, _, len, off) as a -> + let off' = ref dst_off in + for idx = off to len + off - 1 do + let data = Array.unsafe_get s idx in + unsafe_blit_string data 0 buf !off' (String.length data); + off' := !off' + String.length data + 1; + if idx - off < (len - 1 : int) || (Int.equal (idx - off) (len - 1) && last_is_nl a) then + Bytes.unsafe_set buf (!off' - 1) '\n' + done + | App (l, r, _) -> + into_bytes buf dst_off l; + into_bytes buf (dst_off + byte_length l) r + +let to_string t = + let len = byte_length t in + let buf = Bytes.create len in + into_bytes buf 0 t; + Bytes.unsafe_to_string buf + +let concat a b = append a b + +let of_strings xs last_is_nl = + let d = Array.of_list xs in + Str (d, last_is_nl, Array.length d, 0) + +let of_string str = + let splitted = String.split_on_char '\n' str in + let last_is_nl = String.unsafe_get str (String.length str - 1) = '\n' in + let d = Array.of_list splitted in + Str (d, last_is_nl, Array.length d - (if last_is_nl then 1 else 0), 0) + +let rec equal_to_string_list t = function + | [] -> length t = 0 + | hd :: tl -> + let rec find_data = function + | Str (data, _, len, off) -> + if len > 0 then Some (Array.get data off) else None + | App (l, r, _) -> + if length l > 0 then + find_data l + else + find_data r + in + match find_data t with + | None -> false + | Some data -> + String.equal hd data && + equal_to_string_list (shift t 1) tl diff --git a/src/rope.mli b/src/rope.mli new file mode 100644 index 0000000..e34c281 --- /dev/null +++ b/src/rope.mli @@ -0,0 +1,36 @@ +type t (** The type for a rope data structure *) + +val length : t -> int +(** [length t] returns the amount of strings in [t]. *) + +val empty : t +(** [empty] is the empty rope. *) + +val of_strings : string list -> bool -> t +(** [of_strings xs nl] is a rope [t] which contains the strings of [xs]. If + [nl] is true, the last string will have a newline, otherwise not. *) + +val of_string : string -> t +(** [of_string str] will split the string [str] on newline, and return a rope. *) + +val to_string : t -> string +(** [to_string t] is the string where the contents of [t] is present. *) + +val chop : t -> ?off:int -> int -> t +(** [chop t ~off len] returns a new rope that contains [len] strings starting + at [off] of the provided rope [t]. Raises Invalid_argument if [len] and + [off] are not inside the bounds. *) + +val shift : t -> int -> t +(** [shift t len] returns a new rope that does not contain the first [len] + strings, but only the remaining strings of [t]. *) + +val concat : t -> t -> t +(** [concat t t'] returns a new rope which contains [t] followed by [t']. *) + +val last_is_nl : t -> bool +(** [last_is_nl t] returns [true] if the last string should have a newline. *) + +val equal_to_string_list : t -> string list -> bool +(** [equal_to_string_list t xs] returns [true] if the content of [t] is equal to + the content of [xs]. *) diff --git a/test/data/external b/test/data/external index 6838772..495bc17 160000 --- a/test/data/external +++ b/test/data/external @@ -1 +1 @@ -Subproject commit 6838772a456e65ecbea999c27d5ca7c794026566 +Subproject commit 495bc17881d4e8f7df1645dea4d92df8cc7bd79f diff --git a/test/test.ml b/test/test.ml index b392067..e566112 100644 --- a/test/test.ml +++ b/test/test.ml @@ -1101,12 +1101,24 @@ let one_mil_apply () = Alcotest.(check string) __LOC__ expected (Option.get actual) | None, _, _ | _, None, _ | _, _, None -> Alcotest.skip () +let many_hunks_old = lazy (opt_read "./external/many-hunks.old") +let many_hunks_new = lazy (opt_read "./external/many-hunks.new") +let many_hunks_diff = lazy (opt_read "./external/many-hunks.diff") +let many_hunks_apply () = + match Lazy.force many_hunks_old, Lazy.force many_hunks_new, Lazy.force many_hunks_diff with + | Some many_hunks_old, Some expected, Some diff -> + let patch = Patch.parse ~p:0 diff in + let actual = Patch.patch ~cleanly:true (Some many_hunks_old) (List.hd patch) in + Alcotest.(check string) __LOC__ expected (Option.get actual) + | None, _, _ | _, None, _ | _, _, None -> Alcotest.skip () + let big_diff = [ "parse", `Quick, parse_big; "print", `Quick, print_big; "parse own", `Quick, parse_own; "1_000_000 print", `Quick, one_mil_print; "1_000_000 apply", `Quick, one_mil_apply; + "many-hunks apply", `Quick, many_hunks_apply; ] let print_diff_mine_empty_their_no_nl () =