#!/usr/bin/env python3
########################################################################
#
#  Copyright (c) 2010 Jonas Jensen, Miklos Vajna
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.
#
########################################################################


30 31 32 33 34 35
import sys
import re
import subprocess
import os
import argparse
import string
36 37 38 39

class Parser:
    """
    Extracts comments from source files, guesses their language and
    prints out the German ones.

    Constructing an instance runs the whole tool: it parses the command
    line, starts the ``text_cat`` helper process used for language
    detection, and scans every git-tracked file below the requested
    directory whose name ends in .c, .cc, .cpp, .cxx, .h, .hxx or .mm.
    """

    def __init__(self):
        # Characters trimmed from both ends of every extracted comment.
        self.strip = string.punctuation + " \n"

        parser = argparse.ArgumentParser(description='Searches for German comments in cxx/hxx source files inside a given root directory recursively.')
        parser.add_argument("-f", "--filenames-only", action="store_true",
            help="Only print the filenames of files containing German comments")
        parser.add_argument("-v", "--verbose", action="store_true",
            help="Turn on verbose mode (print only positives progress to stderr)")
        parser.add_argument("-l", "--line-numbers", action="store_true",
            help="Prints the filenames and line numbers only.")
        parser.add_argument("-L", "--line-numbers-pos", action="store_true",
            help="Prints the filenames and line numbers only (if positive).")
        parser.add_argument("-t", "--threshold", action="store", default=0, type=int,
            help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
        parser.add_argument("directory", nargs='?', default='.', type=str, help='Give a directory to search in')
        # Parse the command line before launching the helper, so that
        # "--help" and usage errors do not depend on text_cat being present.
        self.args = parser.parse_args()
        self.text_cat = self.start_text_cat()
        self.check_source_files(self.args.directory)

    def get_comments(self, filename):
        """
        Extracts the source code comments.

        Yields ``(line_number, text)`` tuples.  Consecutive ``//`` comments
        that have nothing but whitespace/punctuation in front of them are
        merged into one group (joined with a newline and four spaces) so
        that the language guesser gets more text to work with.  The
        reported line number is the line on which the group was flushed:
        the first line after a ``//`` group, the closing line of a
        ``/* ... */`` block, or the line of a one-line C-style comment.
        """
        if self.args.verbose:
            print("processing file '%s'...\n" % filename)

        # errors="replace": one undecodable byte in a source file must not
        # abort the whole scan.
        with open(filename, errors="replace") as sock:
            # add an empty line to trigger the output of collected oneliner
            # comment group
            lines = sock.readlines() + ["\n"]

        in_comment = False
        buf = []
        for count, line in enumerate(lines, 1):
            if "//" in line and not in_comment:
                # if we find a new //-style comment, then we
                # just append it to a previous one if: there is
                # only whitespace before the // mark that is
                # necessary to make comments longer, giving
                # more reliable output
                if not re.sub(r"(.*)//.*", r"\1", line).strip(self.strip):
                    s = re.sub(r".*// ?", "", line).strip(self.strip)
                    if s:
                        buf.append(s)
                else:
                    # otherwise it's an independent //-style comment in the
                    # next line: flush what was collected so far (possibly
                    # nothing) and start a new group with this comment
                    yield (count, "\n    ".join(buf))
                    buf = [re.sub(r".*// ?", "", line.strip(self.strip))]
            elif "//" not in line and not in_comment and buf:
                # first normal line after a // block
                yield (count, "\n    ".join(buf))
                buf = []
            elif "/*" in line and "*/" not in line and not in_comment:
                # start of a real multiline comment
                in_comment = True
                s = re.sub(r".*/\*+", "", line.strip(self.strip))
                if s:
                    buf.append(s.strip(self.strip))
            elif in_comment and "*/" not in line:
                # in multiline comment: drop the leading " * " / " | " decoration
                s = re.sub(r"^( |\|)*\*?", "", line).strip(self.strip)
                if s:
                    buf.append(s)
            elif "*/" in line and in_comment:
                # end of multiline comment
                in_comment = False
                s = re.sub(r"\*+/.*", "", line.strip(self.strip))
                if s:
                    buf.append(s)
                yield (count, "\n    ".join(buf))
                buf = []
            elif "/*" in line and "*/" in line:
                # c-style oneliner comment
                yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", line).strip(self.strip))

    def start_text_cat(self):
        """
        Starts the ``text_cat`` language guesser and returns the process.

        The helper and its ``LM`` directory are addressed relative to the
        directory containing this script, so the working directory is
        switched there for the launch and restored afterwards, even when
        the launch fails.
        """
        cwd = os.getcwd()
        # change to our directory
        os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
        try:
            sock = subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        finally:
            os.chdir(cwd)
        return sock

    def get_lang(self, s):
        """ the output is 'german' or 'english' or 'german or english'. When
        unsure, just don't warn, there are strings where you just can't
        determine the results reliably, like '#110680#'

        The answer is returned as the raw bytes line read back from the
        helper, with surrounding whitespace stripped. """
        self.text_cat.stdin.write(bytes(s, 'utf-8'))
        self.text_cat.stdin.write(bytes("\n", 'utf-8'))
        self.text_cat.stdin.flush()
        lang = self.text_cat.stdout.readline().strip()
        return lang

    def is_german(self, s):
        """
        determines if a string is German or not
        """
        # for short strings we can't do reliable recognition, so skip
        # short strings and less than 4 words
        s = s.replace('\n', ' ')
        if len(s) < 32 or len(s.split()) < 4:
            return False
        return self.get_lang(s) == b"german"

    @staticmethod
    def _tab_calc(path):
        """
        Returns how many tabs to print after *path* so that the line
        numbers start in the same column for paths shorter than 40
        characters (10 tab stops of width 4); at least one tab.
        """
        START = 40  # Default of 10 tabs
        if len(path) >= START:
            return 1
        # integer ceiling division: a partially used tab stop counts as one
        return (START - len(path) + 3) // 4

    def check_file(self, path):
        """
        checks each comment in a file

        Depending on the command line flags this prints either the line
        numbers of the German comments, the comments themselves, or only
        the file name.
        """
        if self.args.line_numbers or self.args.line_numbers_pos:
            TABS = "\t"*10
            path_linenums = [linenum for linenum, s in self.get_comments(path)
                             if self.is_german(s)]
            valid = len(path_linenums) > int(self.args.threshold)
            if self.args.line_numbers:
                print("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
            if valid:
                if self.args.line_numbers_pos:
                    print("%s ... %s positives\n" % (path, str(len(path_linenums))))
                    return
                if len(path) + (len(path_linenums)*4) > 75:
                    # too wide for one line: path first, then the line
                    # numbers in rows of ten
                    print("%s:\n" % path)
                    for start in range(0, len(path_linenums), 10):
                        numline = [str(n) for n in path_linenums[start:start + 10]]
                        print("%s%s" % (TABS, ",".join(numline)))
                else:
                    if self.args.line_numbers:
                        path_linenums = [str(n) for n in path_linenums]
                        print("%s:%s%s" % (path, "\t"*self._tab_calc(path), ",".join(path_linenums)))

        elif not self.args.filenames_only:
            for linenum, s in self.get_comments(path):
                if self.is_german(s):
                    print("%s:%s: %s" % (path, linenum, s))
        else:
            # Print the filename once; the first German comment settles it,
            # so the remaining comments need not be classified.
            if any(self.is_german(s) for _, s in self.get_comments(path)):
                print(path)

    def first_elem(self, path):
        """
        Returns the root directory in our repo of a given path, so we can check against the whitelist.

        For a path without a directory part the result is the empty
        string.
        """
        lastElem = os.path.dirname(path)
        while True:
            nextElem = os.path.split(lastElem)[0]
            # '' ends a relative path; an unchanged head means the
            # filesystem root was reached (splitting '/' yields '/' again),
            # which would otherwise loop forever.
            if nextElem == '' or nextElem == lastElem:
                return lastElem
            lastElem = nextElem

    def check_source_files(self, directory):
        """
        checks each _tracked_ file in a directory recursively
        """

        # top-level project directory -> use whitelist.
        globalscan = os.path.exists(directory + "/.git/config")

        # Change into the given dir, so "git ls-tree" does work.
        os.chdir(directory)

        with os.popen(r"git ls-tree -r HEAD --name-only |egrep '\.(c|cc|cpp|cxx|h|hxx|mm)$'") as sock:
            lines = sock.readlines()

        # Helps to speedup a global scan.
        # Value 0: scan the directory, 1: skip it, 2: skip it and the
        # "Skipping" notice has already been written.
        directory_whitelist = {
            "ure" : 1,
            "ios" : 1,
            "bean" : 1,
            "apple_remote" : 1,
            "UnoControls" : 1,
            "accessibility" : 1,
            "android" : 1,
            "animations" : 1,
            "avmedia" : 1,
            "basctl" : 1,
            "basegfx" : 1,
            "basic" : 1,
            "binaryurp" : 1,
            "bridges" : 1,
            "canvas" : 1,
            "chart2" : 1,
            "cli_ure" : 1,
            "codemaker" : 1,
            "comphelper" : 1,
            "compilerplugins" : 1,
            "configmgr" : 1,
            "connectivity" : 1,
            "cppcanvas" : 1,
            "cppu" : 1,
            "cppuhelper" : 1,
            "cpputools" : 1,
            "cui" : 1,
            "dbaccess" : 1,
            "desktop" : 1,
            "drawinglayer" : 1,
            "dtrans" : 1,
            "editeng" : 1,
            "embeddedobj" : 1,
            "embedserv" : 1,
            "eventattacher" : 1,
            "extensions" : 1,
            "external" : 1,
            "filter" : 1,
            "forms" : 1,
            "formula" : 1,
            "fpicker" : 1,
            "framework" : 1,
            "helpcompiler" : 1,
            "hwpfilter" : 1,
            "i18npool" : 1,
            "i18nlangtag" : 1,
            "i18nutil" : 1,
            "idl" : 1,
            "idlc" : 1,
            "include" : 1,
            "io" : 1,
            "javaunohelper" : 1,
            "jvmaccess" : 1,
            "jvmfwk" : 1,
            "jurt" : 1,
            "l10ntools" : 1,
            "libreofficekit" : 1,
            "lingucomponent" : 1,
            "linguistic" : 1,
            "lotuswordpro" : 1,
            "mysqlc" : 1,
            "o3tl" : 1,
            "odk" : 1,
            "officecfg" : 1,
            "onlineupdate" : 1,
            "opencl" : 1,
            "oox" : 1,
            "package" : 1,
            "postprocess" : 1,
            "pyuno" : 1,
            "registry" : 1,
            "remotebridges" : 1,
            "reportdesign" : 1,
            "rsc" : 1,
            "sal" : 1,
            "salhelper" : 1,
            "sax" : 1,
            "sc" : 1,
            "scaddins" : 1,
            "sccomp" : 1,
            "scripting" : 1,
            "sd" : 1,
            "sdext" : 1,
            "sfx2" : 1,
            "shell" : 1,
            "setup_native" : 1,
            "sot" : 1,
            "slideshow" : 1,
            "smoketest" : 1,
            "solenv" : 1,
            "soltools" : 1,
            "starmath" : 1,
            "stoc" : 1,
            "store" : 1,
            "svgio" : 1,
            "svl" : 1,
            "svtools" : 1,
            "svx" : 1,
            "sw" : 1,
            "test" : 1,
            "testtools" : 1,
            "toolkit" : 1,
            "tools" : 1,
            "touch" : 1,
            "ucb" : 1,
            "ucbhelper" : 1,
            "unodevtools" : 1,
            "unotest" : 1,
            "unoidl" : 1,
            "unotools" : 1,
            "unoxml" : 1,
            "uui" : 1,
            "vbahelper" : 1,
            "vcl" : 1,
            "winaccessibility" : 1,
            "writerfilter" : 1,
            "writerperfect" : 1,
            "xmlhelp" : 1,
            "xmloff" : 1,
            "xmlreader" : 1,
            "xmlsecurity" : 1,
            "xmlscript" : 1,
        }

        if globalscan:
            print("Scanning all files globally:")
        elif directory == '.':
            print("Scanning all files in our current directory:")
        else:
            print("Scanning all files in", directory + ":")

        num_checked = 0

        for path in lines:
            if not globalscan:
                self.check_file(path.strip())
                num_checked = num_checked + 1
                continue

            # If we have a globalscan use the whitelist.
            baseDir = self.first_elem(path)
            if baseDir not in directory_whitelist:
                sys.stderr.write("\n - Error: Missing path %s -\n\n" % baseDir)
                sys.exit(1)
            elif directory_whitelist[baseDir] == 0:
                self.check_file(path.strip())
                num_checked = num_checked + 1
            elif directory_whitelist[baseDir] == 1:
                sys.stderr.write("Skipping whitelisted directory %s\n" % baseDir)
                # remember that the notice for this directory was written
                directory_whitelist[baseDir] = 2

        print("Scanned %s files\n" % num_checked)

# Run the scan only when executed as a script, so that importing this
# module (e.g. for testing) does not start parsing sys.argv.
if __name__ == "__main__":
    try:
        # Constructing the parser performs the whole scan.
        Parser()
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop a long scan: report it and exit
        # with status 0.
        print("Interrupted!")
        sys.exit(0)

# vim:set shiftwidth=4 softtabstop=4 expandtab: