#!/usr/bin/env python

import sys
import daff

from csvkit import CSVKitReader, CSVKitWriter
from csvkit.cli import CSVKitUtility, match_column_identifier


class CSVDiff(CSVKitUtility):
    """
    Compare two or three CSV files with ``daff`` and emit a tabular diff.

    With two files the second is compared against the first.  With three
    files the first is treated as the common ancestor of the remaining two.
    Every input is read fully into memory before the comparison starts.
    """
    description = 'Compare CSV files. Like unix "diff" command, but for tabular data.'
    epilog = 'Note that the diff operation requires reading all files into memory. Don\'t try this on very large files.'
    # NOTE(review): presumably tells the base class not to register its stock
    # -f / -H options for this utility -- confirm against CSVKitUtility.
    override_flags = ['f', 'H']

    def add_arguments(self):
        """
        Register the positional FILE arguments plus --columns and --color.
        """
        self.argparser.add_argument(metavar="FILE", nargs='*', dest='input_paths', default=['-'],
            help='The CSV files to operate on.')
        self.argparser.add_argument('-c', '--columns', dest='columns',
            help='The column name(s) to use for comparison. Should be either one name or a comma-separated list of names. May also be left unspecified, in which case we\'ll guess something plausible.')
        self.argparser.add_argument('--color', dest='color', action='store_true',
            help='Decorate output with colors and glyphs.')

    def main(self):
        """
        Load the input tables, diff them and write the result.

        The diff is rendered for a terminal when ``--color`` is given or when
        the output file is an interactive ``sys.stdout``; otherwise it is
        written as CSV rows.  Calls ``argparser.error`` unless two or three
        input paths were supplied.
        """
        paths = self.args.input_paths

        # Validate the number of inputs before opening anything so that a
        # usage error never leaves file handles behind.
        ct = len(paths)
        if ct < 2:
            self.argparser.error('You must specify two or three files to compare.\n(If three, the first file should be a common ancestor of the remaining two)')
        if ct > 3:
            self.argparser.error('You must specify at most three files to compare.')

        match_column_names = []
        if self.args.columns:
            match_column_names = self._parse_match_column_names(self.args.columns)

        self.input_files = []
        tables = []

        for path in paths:
            f = self._open_input_file(path)
            self.input_files.append(f)
            try:
                tables.append(list(CSVKitReader(f, **self.reader_kwargs)))
            finally:
                # Close the handle even when the reader fails part-way through.
                f.close()

        flags = daff.CompareFlags()
        for c in match_column_names:
            flags.addPrimaryKey(c)

        # ``result`` is the plain list behind the output table view; the rows
        # produced by the highlighter are read back from it below.
        result = []
        tab = daff.PythonTableView(result)
        # The two tables being compared are always the last two inputs.
        tab1 = daff.PythonTableView(tables[ct - 2])
        tab2 = daff.PythonTableView(tables[ct - 1])
        if ct == 3:
            # Three-way comparison: the first input is the common ancestor.
            tab0 = daff.PythonTableView(tables[0])
            alignment = daff.Coopy.compareTables3(tab0, tab1, tab2, flags).align()
        else:
            alignment = daff.Coopy.compareTables(tab1, tab2, flags).align()
        daff.TableDiff(alignment, flags).hilite(tab)

        if self.args.color or (self.output_file == sys.stdout and sys.stdout.isatty()):
            rendered = daff.TerminalDiffRender().render(tab)
            # Only Python 2 byte streams need explicit encoding; writing
            # bytes to a Python 3 text stream raises TypeError.
            if sys.version_info[0] < 3:
                rendered = rendered.encode('utf-8')
            self.output_file.write(rendered)
        else:
            output = CSVKitWriter(self.output_file, **self.writer_kwargs)
            for row in result:
                output.writerow(row)

    def _parse_match_column_names(self, join_string):
        """
        Parse a comma-separated list of match columns.

        Returns a list with one entry per comma-separated item, each stripped
        of surrounding whitespace.
        """
        # Calling .strip() on each item (rather than mapping str.strip) also
        # works for unicode values on Python 2.
        return [name.strip() for name in join_string.split(',')]


def launch_new_instance():
    """
    Console-script entry point: build the utility and run it.
    """
    utility = CSVDiff()
    utility.main()


if __name__ == "__main__":
    launch_new_instance()
raw:: html + + <p style="font-family: monospace;"> + <span style="font-weight:bold">@@</span> ,<span style="font-weight:bold">bridge</span> ,<span style="font-weight:bold">designer</span> ,<span style="font-weight:bold">length</span><br/> + ,Brooklyn ,"J. A. Roebling" ,1595<br/> + <span style="color:green"></span><span style="color:green;font-weight:bold">+++</span>,<span style="color:green"></span><span style="color:green;font-weight:bold">Manhattan</span> ,"<span style="color:green"></span><span style="color:green;font-weight:bold">G. Lindenthal</span>" ,<span style="color:green"></span><span style="color:green;font-weight:bold">1470</span><br/> + <span style="color:red"></span><span style="color:red;font-weight:bold"></span><span style="color:blue;font-weight:bold"></span><span style="color:blue;font-weight:bold">→</span><span style="color:green;font-weight:bold"></span><span style="color:green;font-weight:bold"></span> ,Williamsburg ,"<span style="color:red"></span><span style="color:red;font-weight:bold">D. Duck</span><span style="color:blue;font-weight:bold"></span><span style="color:blue;font-weight:bold">→</span><span style="color:green;font-weight:bold"></span><span style="color:green;font-weight:bold">L. L. Buck</span>" ,1600<br/> + ,Queensborough ,"Palmer & Hornbostel",1182<br/> + ...,... ,... ,...<br/> + ,"George Washington","O. H. Ammann" ,3500<br/> + <span style="color:red"></span><span style="color:red;font-weight:bold">---</span>,<span style="color:red"></span><span style="color:red;font-weight:bold">Spamspan</span> ,"<span style="color:red"></span><span style="color:red;font-weight:bold">S. Spamington</span>" ,<span style="color:red"></span><span style="color:red;font-weight:bold">10000</span> + </p> + +See http://dataprotocols.org/tabular-diff-format/ for information on the diff format. Color highlighting will by default only activate when the result is being shown on a console. 
#!/usr/bin/env python

import six

try:
    import unittest2 as unittest
except ImportError:
    import unittest

from csvkit.utilities.csvdiff import CSVDiff


# NOTE(review): the class name looks copied from the csvjoin tests even though
# it exercises CSVDiff; kept unchanged here -- consider renaming.
class TestCSVJoin(unittest.TestCase):
    def test_basic(self):
        """
        Diffing the two join example files should produce six output lines.
        """
        cli_args = ['examples/join_a.csv', 'examples/join_b.csv']
        sink = six.StringIO()

        CSVDiff(cli_args, sink).main()

        # Re-wrap the captured text so it can be read back line by line.
        emitted = six.StringIO(sink.getvalue())
        line_count = len(emitted.readlines())

        self.assertEqual(line_count, 6)