Reimplementing wc: Where Text Abstractions Leak
Reimplementing wc is less about counting and more about confronting the places where “text” stops being a clean abstraction. The interface is trivial; the semantics are not. Most of the interesting behavior lives at the boundary between bytes, encodings, and Unix I/O conventions. Below is a minimal clone. It’s deliberately scoped: correct along a few dimensions, incomplete along others.

```python
#!/usr/bin/env python3
import argparse, sys

def get_bytes(raw):
    return len(raw)

def get_lines(text):
    return text.count('\n')

def get_words(text):
    return len(text.split())

def get_chars(text):
    return len(text)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('file_path', nargs='?')
    parser.add_argument('-c', action="store_true")
    parser.add_argument('-l', action="store_true")
    parser.add_argument('-w', action="store_true")
    parser.add_argument('-m', action="store_true")
    args = parser.parse_args()

    if args.file_path:
        with open(args.file_path, "rb") as f:
            raw = f.read()
        label = args.file_path
    elif not sys.stdin.isatty():
        raw = sys.stdin.buffer.read()
        label = None
    else:
        return

    text = raw.decode('utf-8')

    if not any([args.c, args.l, args.w, args.m]):
        args.c = args.l = args.w = True

    results = []
    if args.l:
        results.append(str(get_lines(text)))
    if args.w:
        results.append(str(get_words(text)))
    if args.c:
        results.append(str(get_bytes(raw)))
    if args.m:
        results.append(str(get_chars(text)))

    parts = ' '.join(results)
    print(f"{parts} {label}" if label else parts)

if __name__ == "__main__":
    main()
```

Bytes vs characters is an interface boundary, not trivia

The -c / -m distinction is where most implementations quietly diverge from spec. ...