Showing preview only (335K chars total). Download the full file or copy to clipboard to get everything.
Repository: tech-srl/code2seq
Branch: master
Commit: 8ca14173c323
Files: 71
Total size: 312.7 KB
Directory structure:
gitextract_t37_dsto/
├── .gitignore
├── CITATION.cff
├── CSharpExtractor/
│ ├── .gitattributes
│ ├── .gitignore
│ ├── CSharpExtractor/
│ │ ├── .nuget/
│ │ │ └── packages.config
│ │ ├── CSharpExtractor.sln
│ │ └── Extractor/
│ │ ├── Extractor.cs
│ │ ├── Extractor.csproj
│ │ ├── PathFinder.cs
│ │ ├── Program.cs
│ │ ├── Properties/
│ │ │ └── launchSettings.json
│ │ ├── Temp.cs
│ │ ├── Tree/
│ │ │ └── Tree.cs
│ │ ├── Utilities.cs
│ │ └── Variable.cs
│ └── extract.py
├── Input.java
├── JavaExtractor/
│ ├── JPredict/
│ │ ├── .classpath
│ │ ├── .gitignore
│ │ ├── src/
│ │ │ └── main/
│ │ │ └── java/
│ │ │ ├── JavaExtractor/
│ │ │ │ ├── App.java
│ │ │ │ ├── Common/
│ │ │ │ │ ├── CommandLineValues.java
│ │ │ │ │ ├── Common.java
│ │ │ │ │ └── MethodContent.java
│ │ │ │ ├── ExtractFeaturesTask.java
│ │ │ │ ├── FeatureExtractor.java
│ │ │ │ ├── FeaturesEntities/
│ │ │ │ │ ├── ProgramFeatures.java
│ │ │ │ │ ├── ProgramRelation.java
│ │ │ │ │ └── Property.java
│ │ │ │ └── Visitors/
│ │ │ │ ├── FunctionVisitor.java
│ │ │ │ └── LeavesCollectorVisitor.java
│ │ │ └── Test.java
│ │ └── target/
│ │ └── JavaExtractor-0.0.1-SNAPSHOT.jar
│ └── extract.py
├── LICENSE
├── Python150kExtractor/
│ ├── README.md
│ ├── extract.py
│ └── preprocess.sh
├── README.md
├── __init__.py
├── baseline_tokenization/
│ ├── input_example.txt
│ ├── javalang/
│ │ ├── __init__.py
│ │ ├── ast.py
│ │ ├── javadoc.py
│ │ ├── parse.py
│ │ ├── parser.py
│ │ ├── test/
│ │ │ ├── __init__.py
│ │ │ ├── source/
│ │ │ │ └── package-info/
│ │ │ │ ├── AnnotationJavadoc.java
│ │ │ │ ├── AnnotationOnly.java
│ │ │ │ ├── JavadocAnnotation.java
│ │ │ │ ├── JavadocOnly.java
│ │ │ │ └── NoAnnotationNoJavadoc.java
│ │ │ ├── test_java_8_syntax.py
│ │ │ ├── test_javadoc.py
│ │ │ ├── test_package_declaration.py
│ │ │ └── test_util.py
│ │ ├── tokenizer.py
│ │ ├── tree.py
│ │ └── util.py
│ └── subtokenize_nmt_baseline.py
├── code2seq.py
├── common.py
├── config.py
├── extractor.py
├── interactive_predict.py
├── model.py
├── preprocess.py
├── preprocess.sh
├── preprocess_csharp.sh
├── reader.py
├── train.sh
└── train_python150k.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.class
*.lst
.idea/*
*.iml
*.xml
*.pyc
================================================
FILE: CITATION.cff
================================================
@inproceedings{
alon2018codeseq,
title={code2seq: Generating Sequences from Structured Representations of Code},
author={Uri Alon and Shaked Brody and Omer Levy and Eran Yahav},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=H1gKYo09tX},
}
================================================
FILE: CSharpExtractor/.gitattributes
================================================
###############################################################################
# Set default behavior to automatically normalize line endings.
###############################################################################
* text=auto
###############################################################################
# Set default behavior for command prompt diff.
#
# This is needed for earlier builds of msysgit that do not have it on by
# default for csharp files.
# Note: This is only used by command line
###############################################################################
#*.cs diff=csharp
###############################################################################
# Set the merge driver for project and solution files
#
# Merging from the command prompt will add diff markers to the files if there
# are conflicts (Merging from VS is not affected by the settings below, in VS
# the diff markers are never inserted). Diff markers may cause the following
# file extensions to fail to load in VS. An alternative would be to treat
# these files as binary and thus will always conflict and require user
# intervention with every merge. To do so, just uncomment the entries below
###############################################################################
#*.sln merge=binary
#*.csproj merge=binary
#*.vbproj merge=binary
#*.vcxproj merge=binary
#*.vcproj merge=binary
#*.dbproj merge=binary
#*.fsproj merge=binary
#*.lsproj merge=binary
#*.wixproj merge=binary
#*.modelproj merge=binary
#*.sqlproj merge=binary
#*.wwaproj merge=binary
###############################################################################
# behavior for image files
#
# image files are treated as binary by default.
###############################################################################
#*.jpg binary
#*.png binary
#*.gif binary
###############################################################################
# diff behavior for common document formats
#
# Convert binary document formats to text before diffing them. This feature
# is only available from the command line. Turn it on by uncommenting the
# entries below.
###############################################################################
#*.doc diff=astextplain
#*.DOC diff=astextplain
#*.docx diff=astextplain
#*.DOCX diff=astextplain
#*.dot diff=astextplain
#*.DOT diff=astextplain
#*.pdf diff=astextplain
#*.PDF diff=astextplain
#*.rtf diff=astextplain
#*.RTF diff=astextplain
================================================
FILE: CSharpExtractor/.gitignore
================================================
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# DNX
project.lock.json
artifacts/
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produce more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.pfx
*.publishsettings
node_modules/
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# no data
data/*
backupdata/*
================================================
FILE: CSharpExtractor/CSharpExtractor/.nuget/packages.config
================================================
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="NUnit.ConsoleRunner" version="3.6.0" />
<package id="NUnit.Extension.NUnitProjectLoader" version="3.5.0" />
<package id="NUnit.Extension.NUnitV2Driver" version="3.6.0" />
<package id="NUnit.Extension.NUnitV2ResultWriter" version="3.5.0" />
<package id="NUnit.Extension.TeamCityEventListener" version="1.0.2" />
<package id="NUnit.Extension.VSProjectLoader" version="3.5.0" />
<package id="NUnit3TestAdapter" version="3.7.0" />
</packages>
================================================
FILE: CSharpExtractor/CSharpExtractor/CSharpExtractor.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.136
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Extractor", "Extractor\Extractor.csproj", "{481EDE3F-0ED1-4CB9-814A-63A821022552}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
Release|x86 = Release|x86
Release20|Any CPU = Release20|Any CPU
Release20|x64 = Release20|x64
Release20|x86 = Release20|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.Build.0 = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.ActiveCfg = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.Build.0 = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.ActiveCfg = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.Build.0 = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {13A0DA89-D5D9-4E75-850E-70B9FBE88FF8}
EndGlobalSection
EndGlobal
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
================================================
using Extractor.Semantics;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
namespace Extractor
{
/// <summary>
/// Extracts code2seq-style path contexts from C# source code: for every
/// method, pairs of variable-usage leaves are connected by their AST path,
/// and each (leaf, path, leaf) triple is rendered as one context string.
/// </summary>
public class Extractor
{
    // Delimiter used both between subtokens and between path nodes.
    public const string InternalDelimiter = "|";
    // Separators for the upward / downward halves of an AST path
    // (both aliases of the same delimiter).
    public const string UpTreeChar = InternalDelimiter;
    public const string DownTreeChar = InternalDelimiter;
    // Placeholder emitted in place of the extracted method's own name.
    public const string MethodNameConst = "METHOD_NAME";
    // Parent kinds for which a (truncated) child index is appended to a path
    // node, distinguishing e.g. the two sides of an assignment or the
    // positions in an argument list.
    public static SyntaxKind[] ParentTypeToAddChildId = new SyntaxKind[] { SyntaxKind.SimpleAssignmentExpression,
        SyntaxKind.ElementAccessExpression, SyntaxKind.SimpleMemberAccessExpression, SyntaxKind.InvocationExpression, SyntaxKind.BracketedArgumentList, SyntaxKind.ArgumentList};
    // Variables of the method currently being processed (set in Extract()).
    private ICollection<Variable> variables;
    public int LengthLimit { get; set; }  // max number of nodes allowed in a path
    public int WidthLimit { get; set; }   // max sibling distance allowed at the path's top
    public string Code { get; set; }      // source text being processed
    public bool ShouldHash { get; set; }  // when true, path strings are replaced by their hash
    public int MaxContexts { get; set; }  // max number of variable pairs sampled per method
    public Extractor(string code, Options opts)
    {
        LengthLimit = opts.MaxLength;
        WidthLimit = opts.MaxWidth;
        ShouldHash = !opts.NoHash;
        MaxContexts = opts.MaxContexts;
        Code = code;
    }
    // Shared builder reused across calls; makes PathNodesToString non-reentrant.
    StringBuilder builder = new StringBuilder();
    // Renders the node-kind sequence of a path (without the two terminal
    // tokens): left side upward, common ancestor, right side downward.
    private string PathNodesToString(PathFinder.Path path)
    {
        builder.Clear();
        var nodeTypes = path.LeftSide;
        if (nodeTypes.Count() > 0)
        {
            builder.Append(nodeTypes.First().Kind());
            if (ParentTypeToAddChildId.Contains(nodeTypes.First().Parent.Kind()))
            {
                builder.Append(GetTruncatedChildId(nodeTypes.First()));
            }
            foreach (var n in nodeTypes.Skip(1))
            {
                builder.Append(UpTreeChar).Append(n.Kind());
                if (ParentTypeToAddChildId.Contains(n.Parent.Kind()))
                {
                    builder.Append(GetTruncatedChildId(n));
                }
            }
            builder.Append(UpTreeChar);
        }
        builder.Append(path.Ancesstor.Kind());
        nodeTypes = path.RightSide;
        if (nodeTypes.Count() > 0)
        {
            builder.Append(DownTreeChar);
            builder.Append(nodeTypes.First().Kind());
            if (ParentTypeToAddChildId.Contains(nodeTypes.First().Parent.Kind()))
            {
                builder.Append(GetTruncatedChildId(nodeTypes.First()));
            }
            foreach (var n in nodeTypes.Skip(1))
            {
                builder.Append(DownTreeChar).Append(n.Kind());
                if (ParentTypeToAddChildId.Contains(n.Parent.Kind()))
                {
                    builder.Append(GetTruncatedChildId(n));
                }
            }
        }
        return builder.ToString();
    }
    // Index of n among its parent's children, capped at 3 so rare large
    // indices do not blow up the path vocabulary.
    private int GetTruncatedChildId(SyntaxNode n)
    {
        var parent = n.Parent;
        int index = parent.ChildNodes().ToList().IndexOf(n);
        if (index > 3)
        {
            index = 3;
        }
        return index;
    }
    // Full textual path including the terminal token texts.
    // NOTE(review): currently unused within this class — Extract() builds
    // context strings from PathNodesToString directly. Confirm before removal.
    private string PathToString(PathFinder.Path path)
    {
        SyntaxNode ancesstor = path.Ancesstor;
        StringBuilder builder = new StringBuilder();
        builder.Append(path.Left.Text).Append(UpTreeChar);
        builder.Append(this.PathNodesToString(path));
        builder.Append(DownTreeChar).Append(path.Right.Text);
        return builder.ToString();
    }
    // Enumerates AST paths between leaves of (up to MaxContexts sampled)
    // variable pairs, including each variable paired with itself.
    internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
    {
        var finder = new PathFinder(tree, LengthLimit, WidthLimit);
        var allPairs = Utilities.ReservoirSample(Utilities.WeakConcat(Utilities.Choose2(variables),
            variables.Select((arg) => new Tuple<Variable, Variable>(arg, arg))), MaxContexts);
        //iterate over variable-variable pairs
        foreach (Tuple<Variable, Variable> varPair in allPairs)
        {
            bool pathToSelf = varPair.Item1 == varPair.Item2;
            foreach (var rhs in varPair.Item2.Leaves)
                foreach (var lhs in varPair.Item1.Leaves)
                {
                    // Never pair a leaf with itself.
                    if (lhs == rhs)
                        continue;
                    PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);
                    // FindPath returns null when length/width limits are exceeded.
                    if (path == null)
                        continue;
                    yield return path;
                }
        }
    }
    // Splits an identifier into |-joined subtokens; falls back to a
    // normalized form, then to the sentinels "SPACE"/"BLANK"; the
    // METHOD_NAME placeholder is passed through untouched.
    private string SplitNameUnlessEmpty(string original)
    {
        var subtokens = Utilities.SplitToSubtokens(original).Where(s => s.Length > 0);
        String name = String.Join(InternalDelimiter, subtokens);
        if (name.Length == 0)
        {
            name = Utilities.NormalizeName(original);
        }
        if (String.IsNullOrWhiteSpace(name))
        {
            name = "SPACE";
        }
        if (String.IsNullOrEmpty(name))
        {
            name = "BLANK";
        }
        if (original == Extractor.MethodNameConst)
        {
            name = original;
        }
        return name;
    }
    // Characters stripped from both ends of comment text before subtokenizing.
    static readonly char[] removeFromComments = new char[] {' ', '/', '*', '{', '}'};
    // Produces one result line per method: the method name's subtokens
    // followed by its space-separated path contexts (plus comment contexts).
    public List<String> Extract()
    {
        var tree = new Tree(CSharpSyntaxTree.ParseText(Code).GetRoot());
        IEnumerable<MethodDeclarationSyntax> methods = tree.GetRoot().DescendantNodesAndSelf().OfType<MethodDeclarationSyntax>().ToList();
        List<String> results = new List<string>();
        foreach(var method in methods) {
            String methodName = method.Identifier.ValueText;
            Tree methodTree = new Tree(method);
            var subtokensMethodName = Utilities.SplitToSubtokens(methodName);
            // Map every leaf token back to the variable it belongs to.
            var tokenToVar = new Dictionary<SyntaxToken, Variable>();
            this.variables = Variable.CreateFromMethod(methodTree).ToArray();
            foreach (var variable in variables)
            {
                foreach (SyntaxToken token in variable.Leaves)
                {
                    tokenToVar[token] = variable;
                }
            }
            List<String> contexts = new List<String>();
            foreach (PathFinder.Path path in GetInternalPaths(methodTree))
            {
                String pathString = SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
                    + "," + MaybeHash(this.PathNodesToString(path))
                    + "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name);
                Debug.WriteLine(path.Left.FullSpan+" "+tokenToVar[path.Left].Name+ "," +this.PathNodesToString(path)+ "," + tokenToVar[path.Right].Name+" "+path.Right.FullSpan);
                contexts.Add(pathString);
            }
            // NOTE(review): trivia is collected from the whole file's tree
            // (tree), not from methodTree, so every method in a file receives
            // the same comment contexts — confirm this is intended.
            var commentNodes = tree.GetRoot().DescendantTrivia().Where(
                node => node.IsKind(SyntaxKind.MultiLineCommentTrivia) || node.IsKind(SyntaxKind.SingleLineCommentTrivia) || node.IsKind(SyntaxKind.MultiLineDocumentationCommentTrivia));
            foreach (SyntaxTrivia trivia in commentNodes)
            {
                string commentText = trivia.ToString().Trim(removeFromComments);
                string normalizedTrivia = SplitNameUnlessEmpty(commentText);
                var parts = normalizedTrivia.Split('|');
                // Emit the comment's subtokens in batches of 5, each as a
                // degenerate context whose "path" is the literal COMMENT.
                for (int i = 0; i < Math.Ceiling((double)parts.Length / (double)5); i++)
                {
                    var batch = String.Join("|", parts.Skip(i * 5).Take(5));
                    contexts.Add(batch + "," + "COMMENT" + "," + batch);
                }
            }
            results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
        }
        return results;
    }
    // Returns the path string itself, or its hash code when hashing is enabled.
    private string MaybeHash(string v)
    {
        if (this.ShouldHash)
        {
            return v.GetHashCode().ToString();
        } else
        {
            return v;
        }
    }
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Extractor.csproj
================================================
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.2</TargetFramework>
<StartupObject>Extractor.Program</StartupObject>
</PropertyGroup>
<ItemGroup>
<Compile Remove="Temp.cs" />
</ItemGroup>
<ItemGroup>
<None Include="Temp.cs" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="CommandLineParser" Version="2.3.0" />
<PackageReference Include="Microsoft.CodeAnalysis" Version="2.10.0" />
</ItemGroup>
</Project>
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/PathFinder.cs
================================================
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using System;
using System.Collections.Generic;
using System.Linq;
namespace Extractor
{
/// <summary>
/// Finds the AST path between two leaf tokens through their lowest common
/// ancestor, subject to a maximum length (node count) and width (sibling
/// distance at the ancestor).
/// </summary>
internal class PathFinder
{
    // An AST path: left token, nodes climbed from it, the lowest common
    // ancestor, nodes descended to the right token, and the right token.
    internal class Path
    {
        public SyntaxToken Left { get; }
        public List<SyntaxNode> LeftSide { get; }
        public SyntaxNode Ancesstor { get; }
        public List<SyntaxNode> RightSide { get; }
        public SyntaxToken Right { get; }
        public Path(SyntaxToken left, IEnumerable<SyntaxNode> leftSide, SyntaxNode ancesstor,
            IEnumerable<SyntaxNode> rightSide, SyntaxToken right)
        {
            this.Left = left;
            this.LeftSide = leftSide.ToList();
            this.Ancesstor = ancesstor;
            this.RightSide = rightSide.ToList();
            this.Right = right;
        }
    }
    public int Length { get; }  // max path length (in nodes)
    public int Width { get; }   // max sibling distance at the common ancestor
    Tree tree;
    public PathFinder(Tree tree, int length = 7, int width = 4)
    {
        if (length < 1 || width < 1)
            throw new ArgumentException("Width and Length params must be positive.");
        Length = length;
        Width = width;
        this.tree = tree;
    }
    // Distance of n from the root (the root itself has depth 0).
    private int GetDepth(SyntaxNode n)
    {
        int depth = 0;
        while(n.Parent != null)
        {
            n = n.Parent;
            depth++;
        }
        return depth;
    }
    // Lowest common ancestor of l and r, found by repeatedly lifting the
    // deeper of the two nodes until they meet.
    public SyntaxNode FirstAncestor(SyntaxNode l, SyntaxNode r)
    {
        if (l.Equals(r))
            return l;
        if (GetDepth(l) >= GetDepth(r))
        {
            l = l.Parent;
        }
        else
        {
            r = r.Parent;
        }
        return FirstAncestor(l, r);
    }
    // Lazily yields the nodes from start up to (but excluding) parent.
    private IEnumerable<SyntaxNode> CollectPathToParent(SyntaxNode start, SyntaxNode parent)
    {
        while (!start.Equals(parent))
        {
            yield return start;
            start = start.Parent;
        }
    }
    // Returns the path between tokens l and r, or null when it exceeds the
    // Length limit or (when limited) the Width limit at the ancestor.
    internal Path FindPath(SyntaxToken l, SyntaxToken r, bool limited = true)
    {
        SyntaxNode p = FirstAncestor(l.Parent, r.Parent);
        // + 2 for the distance of the leafs themselves
        if (GetDepth(r.Parent) + GetDepth(l.Parent) - 2 * GetDepth(p) + 2 > Length)
        {
            return null;
        }
        // NOTE: leftSide/rightSide are lazy sequences; each Count()/Last()/
        // First() call below re-walks them, as does Path's ToList().
        var leftSide = CollectPathToParent(l.Parent, p);
        var rightSide = CollectPathToParent(r.Parent, p);
        rightSide = rightSide.Reverse();
        List<SyntaxNode> widthCheck = p.ChildNodes().ToList();
        if (limited && leftSide.Count() != 0
            && rightSide.Count() != 0)
        {
            // Width = distance between the two children of the ancestor
            // through which the path enters and leaves it.
            int indexOfLeft = widthCheck.IndexOf(leftSide.Last());
            int indexOfRight = widthCheck.IndexOf(rightSide.First());
            if (Math.Abs(indexOfLeft - indexOfRight) >= Width)
            {
                return null;
            }
        }
        return new Path(l, leftSide, p, rightSide, r);
    }
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Program.cs
================================================
using CommandLine;
using CommandLine.Text;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace Extractor
{
class Program
{
    // Extracts the path contexts of every method in a single C# source file.
    static List<String> ExtractSingleFile(string filename, Options opts)
    {
        string data = File.ReadAllText(filename);
        var extractor = new Extractor(data, opts);
        List<String> result = extractor.Extract();
        return result;
    }

    static void Main(string[] args)
    {
        // Parse command-line arguments. BUG FIX: the original printed the
        // error *collection object* and used `return` inside the
        // WithNotParsed lambda, which only exits the lambda — Main then
        // continued with an empty Options (null Path) and failed later.
        // Parser.Default already prints usage/errors to the console on
        // failure, so on parse failure we simply stop.
        Options options = null;
        Parser.Default.ParseArguments<Options>(args)
            .WithParsed(opt => options = opt);
        if (options == null)
        {
            return;
        }

        string path = options.Path;
        string[] files;
        if (Directory.Exists(path))
        {
            // A directory: recursively collect all C# files under it.
            files = Directory.GetFiles(path, "*.cs", SearchOption.AllDirectories);
        }
        else
        {
            // Otherwise treat the argument as a single file path.
            files = new string[] { path };
        }

        // Extract all files in parallel and append results to the output file.
        IEnumerable<string> results =
            files.AsParallel()
                 .WithDegreeOfParallelism(options.Threads)
                 .SelectMany(filename => ExtractSingleFile(filename, options));
        using (StreamWriter sw = new StreamWriter(options.OFileName, append: true))
        {
            foreach (var res in results)
            {
                sw.WriteLine(res);
            }
        }
    }
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Properties/launchSettings.json
================================================
{
"profiles": {
"Extractor": {
"commandName": "Project",
"commandLineArgs": "--path C:\\Users\\urial\\Source\\Repos\\CSharpExtractor\\CSharpExtractor\\Extractor\\bin\\ --no_hash"
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Temp.cs
================================================
namespace Extractor
{
// Sample input for manually exercising the extractor. This file is excluded
// from the build in Extractor.csproj (<Compile Remove="Temp.cs" />) and only
// shipped as content, so the undeclared identifiers a, b and c below are
// intentional.
class Temp
{
    class NestedClass
    {
        void fooBar()
        {
            a.b = c;
        }
    }
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Tree/Tree.cs
================================================
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
namespace Extractor
{
/// <summary>
/// Wraps a Roslyn syntax (sub)tree and precomputes, for every node, its
/// ancestors, descendants and leaf tokens (filled in by TreeBuilderWalker).
/// </summary>
public class Tree
{
    // Sentinel names for dummy wrappers; leaves carrying these names are
    // filtered out in Leaf.IsLeafToken.
    public const string DummyClass = "IgnoreDummyClass";
    public const string DummyMethodName = "IgnoreDummyMethod";
    public const string DummyType = "IgnoreDummyType";
    // Token kinds treated as literal leaves.
    internal static readonly SyntaxKind[] literals = { SyntaxKind.NumericLiteralToken, SyntaxKind.StringLiteralToken, SyntaxKind.CharacterLiteralToken };
    // Token kinds treated as identifier leaves.
    internal static readonly HashSet<SyntaxKind> identifiers = new HashSet<SyntaxKind>(new SyntaxKind[] { SyntaxKind.IdentifierToken }); //, SyntaxKind.VoidKeyword, SyntaxKind.StringKeyword });
    internal static readonly HashSet<SyntaxKind> keywords = new HashSet<SyntaxKind>(new SyntaxKind[] { SyntaxKind.RefKeyword, SyntaxKind.OutKeyword, SyntaxKind.ConstKeyword });
    // Node kinds that introduce a variable declaration.
    internal static readonly HashSet<SyntaxKind> declarations = new HashSet<SyntaxKind>(new SyntaxKind[] { SyntaxKind.VariableDeclarator, SyntaxKind.Parameter, SyntaxKind.CatchDeclaration, SyntaxKind.ForEachStatement });
    internal static readonly HashSet<SyntaxKind> memberAccesses = new HashSet<SyntaxKind>(new SyntaxKind[] { SyntaxKind.SimpleMemberAccessExpression, SyntaxKind.PointerMemberAccessExpression });
    // Node kinds that close a variable scope.
    internal static readonly HashSet<SyntaxKind> scopeEnders = new HashSet<SyntaxKind>(
        new SyntaxKind[]{ SyntaxKind.Block, SyntaxKind.ForStatement, SyntaxKind.MethodDeclaration,
        SyntaxKind.ForEachStatement, SyntaxKind.CatchClause, SyntaxKind.SwitchSection, SyntaxKind.UsingStatement });
    // Node kinds that open a lambda / anonymous-function scope.
    internal static readonly HashSet<SyntaxKind> lambdaScopeStarters = new HashSet<SyntaxKind>(
        new SyntaxKind[]{ SyntaxKind.AnonymousMethodExpression,
        SyntaxKind.SimpleLambdaExpression, SyntaxKind.ParenthesizedLambdaExpression });
    public static bool IsScopeEnder(SyntaxNode node)
    {
        return Tree.scopeEnders.Contains(node.Kind());
    }
    // Walker that records, per syntax node, a Node with its ancestors,
    // descendants and leaf tokens. Children are processed before their
    // parent (base.Visit recurses first), so child records already exist
    // when the parent aggregates them.
    class TreeBuilderWalker : CSharpSyntaxWalker
    {
        Dictionary<SyntaxNode, Node> nodes;
        // Nodes currently on the walk stack, i.e. ancestors of the node
        // being visited.
        HashSet<SyntaxNode> visitedNodes;
        List<SyntaxNode> Desc;
        List<SyntaxToken> Tokens;
        Dictionary<SyntaxToken, Leaf> tokens;
        internal TreeBuilderWalker(Dictionary<SyntaxNode, Node> nodes, Dictionary<SyntaxToken, Leaf> tokens)
        {
            visitedNodes = new HashSet<SyntaxNode>();
            this.nodes = nodes;
            this.tokens = tokens;
        }
        public override
        void Visit(SyntaxNode node)
        {
            visitedNodes.Add(node);
            base.Visit(node);
            visitedNodes.Remove(node);
            // Aggregate descendants and leaves from the (already-visited)
            // children of this node.
            Desc = new List<SyntaxNode>();
            Tokens = new List<SyntaxToken>();
            foreach (var c in node.ChildNodes())
            {
                if (!nodes.ContainsKey(c))
                {
                    continue;
                }
                Desc.AddRange(nodes[c].Descendents);
                Desc.Add(c);
                Tokens.AddRange(nodes[c].Leaves);
            }
            // Direct tokens of this node that qualify as leaves.
            foreach (var token in node.ChildTokens())
            {
                if (Leaf.IsLeafToken(token))
                {
                    tokens[token] = new Leaf(nodes, token);
                    Tokens.Add(token);
                }
            }
            Node res = new Node(This: node,
                Ancestors: new HashSet<SyntaxNode>(visitedNodes),
                Descendents: Desc.ToArray(),
                Leaves: Tokens.ToArray(),
                Kind: node.Kind());
            nodes[node] = res;
        }
    }
    internal SyntaxNode GetRoot()
    {
        return tree;
    }
    SyntaxNode tree;
    internal Dictionary<SyntaxNode, Node> nodes = new Dictionary<SyntaxNode, Node>();
    internal Dictionary<SyntaxToken, Leaf> leaves = new Dictionary<SyntaxToken, Leaf>();
    public Tree(SyntaxNode syntaxTree)
    {
        this.tree = syntaxTree;
        /*if (this.tree.ChildNodes().ToList().Count() == 0)
        {
            this.tree = CSharpSyntaxTree.ParseText($"private {DummyType} {DummyMethodName}() {{ {code} }}");
        }*/
        new TreeBuilderWalker(nodes, leaves).Visit(this.tree);
        // NOTE(review): commentNodes is computed but never used here —
        // comment trivia is re-collected in Extractor.Extract() instead.
        List<SyntaxTrivia> commentNodes = tree.DescendantTrivia().Where(
            node => node.IsKind(SyntaxKind.MultiLineCommentTrivia) || node.IsKind(SyntaxKind.SingleLineCommentTrivia)).ToList();
    }
}
/// <summary>
/// Immutable precomputed record for one syntax node: the node itself, its
/// ancestors, descendants, leaf tokens, kind, and depth.
/// </summary>
public class Node
{
    /// <param name="Depth">Depth of the node in the tree. Optional (default 0)
    /// so the existing call site in TreeBuilderWalker, which does not pass it,
    /// keeps compiling with unchanged behavior.</param>
    public Node(SyntaxNode This, HashSet<SyntaxNode> Ancestors, SyntaxNode[] Descendents,
        SyntaxToken[] Leaves, SyntaxKind Kind, int Depth = 0)
    {
        this.This = This;
        this.Ancestors = Ancestors;
        this.Descendents = Descendents;
        this.AncestorsAndSelf = new HashSet<SyntaxNode>(Ancestors);
        this.AncestorsAndSelf.Add(This);
        this.Leaves = Leaves;
        // BUG FIX: the original wrote "this.Depth = Depth;" with no Depth
        // parameter in scope — a self-assignment of the get-only property,
        // which always left Depth at 0. It now stores the new parameter
        // (still 0 for the existing caller).
        this.Depth = Depth;
        this.Kind = Kind;
        this.KindName = Kind.ToString();
    }
    public SyntaxNode This { get; }
    public HashSet<SyntaxNode> Ancestors { get; }
    public HashSet<SyntaxNode> AncestorsAndSelf { get; }
    public SyntaxNode[] Descendents { get; }
    public SyntaxToken[] Leaves { get; }
    public SyntaxKind Kind { get; }
    public string KindName { get; }
    public int Depth { get; }
    // Two Nodes are equal iff they wrap the same underlying syntax node;
    // hashing is delegated to that node as well.
    public override bool Equals(object obj)
    {
        var item = obj as Node;
        if (item == null)
        {
            return false;
        }
        return this.This.Equals(item.This);
    }
    public override int GetHashCode()
    {
        return this.This.GetHashCode();
    }
}
/// <summary>
/// A leaf token of the tree: an identifier, a literal, or a token belonging
/// to a predefined type, together with its kind and name.
/// </summary>
public class Leaf
{
    // True when the token should be treated as a leaf: identifiers and
    // literals (per Tree.identifiers / Tree.literals) and predefined-type
    // tokens, excluding the contextual keyword "var" used as a local
    // declaration's type and the dummy sentinel names.
    internal static bool IsLeafToken(SyntaxToken token)
    {
        // "var" in a local declaration is an IdentifierToken syntactically,
        // but it is a type keyword, not a variable usage — skip it.
        if (token.Text.Equals("var") && token.IsKind(SyntaxKind.IdentifierToken)
            && token.Parent.IsKind(SyntaxKind.IdentifierName) && token.Parent.Parent.IsKind(SyntaxKind.VariableDeclaration)
            && token.Parent.Parent.Parent.IsKind(SyntaxKind.LocalDeclarationStatement))
        {
            return false;
        }
        // Skip the sentinel names of the dummy snippet wrapper.
        if (token.ValueText == Tree.DummyMethodName || token.ValueText == Tree.DummyType)
        {
            return false;
        }
        return Tree.identifiers.Contains(token.Kind()) || Tree.literals.Contains(token.Kind()) || token.Parent.Kind() == SyntaxKind.PredefinedType;
    }
    public SyntaxToken token { get; }
    public SyntaxKind Kind { get; }
    public string KindName { get; }
    public string Text { get; set; }
    // True unless the token is an identifier appearing at a declaration site.
    public bool IsConst { get; }
    public string VariableName { get; }
    public Leaf(Dictionary<SyntaxNode, Node> nodes, SyntaxToken token)
    {
        this.token = token;
        Kind = token.Kind();
        KindName = Kind.ToString();
        IsConst = !(Tree.identifiers.Contains(Kind) && Tree.declarations.Contains(token.Parent.Kind()));
        Text = token.ValueText;
        // Cleanup: removed two unused locals (token.Parent.Parent and
        // token.Parent) that the original computed and never read.
        VariableName = Text;
    }
}
/// <summary>
/// Debug helper that renders a syntax tree as a Graphviz dot file.
/// </summary>
public class SyntaxViewer
{
    // Builds a dot digraph with one vertex per syntax node and per token,
    // and an edge from each parent node to its child nodes/tokens.
    // Vertex i names nodes[i] for i < nodes.Count, and
    // tokens[i - nodes.Count] afterwards; a per-label counter suffix keeps
    // the vertex names unique.
    private string ToDot(SyntaxTree tree)
    {
        List<SyntaxNode> nodes = tree.GetRoot().DescendantNodesAndSelf().ToList();
        SyntaxToken[] tokens = tree.GetRoot().DescendantTokens().ToArray();
        string[] tokenStrings = tokens.Select((arg) => arg.Kind().ToString() + "-" + arg.ToString()).ToArray();
        string[] nodeStrings = nodes.Select((arg) => arg.Kind().ToString()).ToArray();
        Dictionary<string, int> counts = new Dictionary<string, int>();
        Dictionary<int, string> nodeNames = new Dictionary<int, string>();
        IEnumerable<string> allItems = nodeStrings.Concat(tokenStrings);
        int i = 0;
        foreach (string name in allItems)
        {
            if (!counts.ContainsKey(name))
                counts[name] = 0;
            counts[name] += 1;
            nodeNames[i] = name + counts[name].ToString();
            i++;
        }
        StringBuilder builder = new StringBuilder();
        builder.AppendLine("digraph G {");
        // vertexes
        for (i = 0; i < allItems.Count(); i++)
        {
            builder.AppendFormat("\"{0}\" ;\n", nodeNames[i]);
        }
        builder.AppendLine();
        // edges (start at i = 1: the root node has no parent)
        for (i = 1; i < nodes.Count(); i++)
        {
            builder.AppendFormat("\"{0}\"->\"{1}\" [];\n", nodeNames[nodes.IndexOf(nodes[i].Parent)], nodeNames[i]);
        }
        for (i = 0; i < tokens.Count(); i++)
        {
            builder.AppendFormat("\"{0}\"->\"{1}\" [];\n", nodeNames[nodes.IndexOf(tokens[i].Parent)], nodeNames[i + nodes.Count()]);
        }
        builder.AppendLine("}");
        return builder.ToString();
    }
    /// <summary>
    /// Writes the dot rendering of <paramref name="tree"/> to
    /// <paramref name="path"/>.
    /// BUG FIX: the original ignored its path parameter (default "out.ong")
    /// and always wrote to the hard-coded "out.dot". The parameter is now
    /// honored, and its default matches the file the original actually
    /// produced, so default callers see identical behavior.
    /// </summary>
    public SyntaxViewer(SyntaxTree tree, string path = "out.dot")
    {
        string dotData = ToDot(tree);
        File.WriteAllText(path, dotData);
    }
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
================================================
using CommandLine;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
using System.Text.RegularExpressions;
namespace Extractor
{
/// <summary>
/// Command-line options for the extractor, parsed with CommandLineParser.
/// BUG FIX: the original declared the short name 'l' for three different
/// options (max_length, max_width, max_contexts); CommandLineParser requires
/// unique short names and rejects such an option set at runtime. max_width
/// and max_contexts now use 'w' and 'c'; all long names are unchanged.
/// </summary>
public class Options
{
    [Option('t', "threads", Default = 1, HelpText = "How many threads to use <1>")]
    public int Threads { get; set; }

    [Option('p', "path", Default = "./data/", HelpText = "Where to find code files. <.>")]
    public string Path { get; set; }

    [Option('l', "max_length", Default = 9, HelpText = "Max path length")]
    public int MaxLength { get; set; }

    // HelpText fixed: it was a copy-paste of "Max path length".
    [Option('w', "max_width", Default = 2, HelpText = "Max path width")]
    public int MaxWidth { get; set; }

    [Option('o', "ofile_name", Default = "test.txt", HelpText = "Output file name")]
    public String OFileName { get; set; }

    // NOTE(review): Default = true on a boolean switch means hashing is
    // effectively always off (Extractor sets ShouldHash = !NoHash) — confirm
    // this is intended before changing.
    [Option('h', "no_hash", Default = true, HelpText = "When enabled, prints the whole path strings (not hashed)")]
    public Boolean NoHash { get; set; }

    [Option('c', "max_contexts", Default = 30000, HelpText = "Max number of path contexts to sample. Affects only very large snippets")]
    public int MaxContexts { get; set; }
}
public static class Utilities
{
    // Numeric literals kept verbatim by NormalizeName; every other pure number
    // becomes "NUM". (Field name keeps its original spelling — it is public surface.)
    public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" };

    /// <summary>
    /// Yields every unordered pair of distinct elements from the enumerable.
    /// NOTE: the source is re-enumerated once per outer element (O(n^2) passes);
    /// callers should hand in a materialized collection.
    /// </summary>
    public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
    {
        int index = 0;
        foreach (var e in enumerable)
        {
            ++index;
            foreach (var t in enumerable.Skip(index))
                yield return Tuple.Create(e, t);
        }
    }

    /// <summary>
    /// Sample uniform randomly numSamples from an enumerable, using reservoir sampling.
    /// See https://en.wikipedia.org/wiki/Reservoir_sampling
    /// </summary>
    /// <typeparam name="T"></typeparam>
    /// <param name="input"></param>
    /// <param name="numSamples"></param>
    /// <returns></returns>
    public static IEnumerable<TSource> ReservoirSample<TSource>(this IEnumerable<TSource> input, int numSamples)
    {
        var rng = new Random();
        var sampledElements = new List<TSource>(numSamples);
        int seenElementCount = 0;
        foreach (var element in input)
        {
            seenElementCount++;
            if (sampledElements.Count < numSamples)
            {
                // Fill the reservoir first.
                sampledElements.Add(element);
            }
            else
            {
                // Replace a reservoir slot with probability numSamples/seenElementCount.
                int position = rng.Next(seenElementCount);
                if (position < numSamples)
                {
                    sampledElements[position] = element;
                }
            }
        }
        Debug.Assert(sampledElements.Count <= numSamples);
        return sampledElements;
    }

    /// <summary>Lazily concatenates two enumerables without materializing either.</summary>
    public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
    {
        foreach (T t in enumerable1)
            yield return t;
        foreach (T t in enumerable2)
            yield return t;
    }

    /// <summary>
    /// Splits an identifier on camelCase humps, underscores, digits and whitespace,
    /// normalizing each surviving piece to lower-case alphabetic subtokens.
    /// </summary>
    public static IEnumerable<String> SplitToSubtokens(String name)
    {
        return Regex.Split(name.Trim(), "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")
            .Where(s => s.Length > 0)
            .Select(s => NormalizeName(s))
            .Where(s => s.Length > 0);
    }

    private static Regex Whitespaces = new Regex(@"\s");
    private static Regex NonAlphabetic = new Regex("[^A-Za-z]");

    /// <summary>
    /// Normalizes a token for vocabulary purposes: lower-case, ASCII-only,
    /// alphabetic characters preserved; whitelisted numbers kept, other pure
    /// numbers collapsed to "NUM", everything else reduced or dropped.
    /// </summary>
    public static String NormalizeName(string s)
    {
        // BUGFIX: String.Replace is literal, not regex. The original
        // .Replace("\\\\n", ...) removed the three-character sequence '\','\','n'
        // instead of escaped newlines; "\\n" (backslash + 'n') matches the Java
        // extractor's replaceAll("\\\\n", "") semantics.
        // NOTE(review): the original also had .Replace("[\"',]", String.Empty),
        // which (being literal) only removed the exact substring ["',]. Quotes
        // and apostrophes are stripped by NonAlphabetic below, and commas are
        // deliberately mapped to 'C' further down, so that dead call is dropped
        // rather than converted to a regex (which would change comma handling).
        String partiallyNormalized = s.ToLowerInvariant()
            .Replace("\\n", String.Empty);
        partiallyNormalized = Whitespaces.Replace(partiallyNormalized, "");
        // Round-trip through ASCII, silently dropping non-encodable characters.
        partiallyNormalized = Encoding.ASCII.GetString(
            Encoding.Convert(
                Encoding.UTF8,
                Encoding.GetEncoding(
                    Encoding.ASCII.EncodingName,
                    new EncoderReplacementFallback(string.Empty),
                    new DecoderExceptionFallback()
                ),
                Encoding.UTF8.GetBytes(partiallyNormalized)
            )
        );
        // Map characters that would corrupt the line-oriented output format.
        if (partiallyNormalized.Contains('\n'))
        {
            partiallyNormalized = partiallyNormalized.Replace('\n', 'N');
        }
        if (partiallyNormalized.Contains('\r'))
        {
            partiallyNormalized = partiallyNormalized.Replace('\r', 'R');
        }
        if (partiallyNormalized.Contains(','))
        {
            partiallyNormalized = partiallyNormalized.Replace(',', 'C');
        }
        String completelyNormalized = NonAlphabetic.Replace(partiallyNormalized, String.Empty);
        if (completelyNormalized.Length == 0)
        {
            if (Regex.IsMatch(partiallyNormalized, @"^\d+$"))
            {
                if (NumbericLiteralsToKeep.Contains(partiallyNormalized))
                {
                    return partiallyNormalized;
                }
                else
                {
                    return "NUM";
                }
            }
            return String.Empty;
        }
        return completelyNormalized;
    }
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
================================================
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp.Syntax;
namespace Extractor
{
namespace Semantics
{
// A named variable within a single method, together with every syntax token
// that refers to it. Semantics of Tree.leaves (IsConst, VariableName, Kind)
// are defined in the project's Tree type, which is not visible here.
public class Variable
{
    // The method tree this variable was collected from.
    Tree tree;

    // The variable's name; Extractor.MethodNameConst for the method-name token.
    public string Name { get; }

    private HashSet<SyntaxToken> leaves;

    // All syntax tokens in the method referring to this variable.
    public HashSet<SyntaxToken> Leaves
    {
        get
        {
            return leaves;
        }
    }

    private Nullable<bool> constant;

    // True when every occurrence is const per the tree's leaf metadata.
    // Always assigned in the constructor, so .Value is safe.
    public bool Const
    {
        get
        {
            return constant.Value;
        }
    }

    private Variable(string name, SyntaxToken[] leaves, Tree tree)
    {
        this.tree = tree;
        this.Name = name;
        this.leaves = new HashSet<SyntaxToken>(leaves);
        constant = true;
        foreach (var leaf in leaves)
        {
            if (!tree.leaves[leaf].IsConst)
            {
                constant = false;
                // If not constant then it is a declaration token
                break;
            }
        }
    }

    public override int GetHashCode()
    {
        // Hash by name only — consistent with CreateFromMethod, which treats
        // same-named tokens as one variable.
        return this.Name.GetHashCode();
    }

    // True when this variable's tokens are literal tokens (per Tree.literals).
    public bool IsLiteral()
    {
        return Tree.literals.Contains(tree.leaves[Leaves.First()].Kind);
    }

    // True when `token` is the identifier naming a method declaration.
    internal static Boolean isMethodName(SyntaxToken token)
    {
        return token.Parent.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.MethodDeclaration)
            && token.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.IdentifierToken);
    }

    // Create a variable for each variable in scope from tokens while splitting identically named but differently scoped vars.
    internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
    {
        var root = methodTree.nodes[methodTree.GetRoot()];
        var leaves = root.Leaves.ToArray();
        // Group leaf tokens by name; the method-name token is mapped to the
        // reserved MethodNameConst so it forms its own group.
        Dictionary<SyntaxToken, string> tokenToName = new Dictionary<SyntaxToken, string>();
        Dictionary<string, List<SyntaxToken>> nameToTokens = new Dictionary<string, List<SyntaxToken>>();
        foreach (SyntaxToken token in root.Leaves)
        {
            string name = methodTree.leaves[token].VariableName;
            if (isMethodName(token))
            {
                name = Extractor.MethodNameConst;
            }
            tokenToName[token] = name;
            if (!nameToTokens.ContainsKey(name))
                nameToTokens[name] = new List<SyntaxToken>();
            nameToTokens[name].Add(token);
        }
        // One Variable per distinct name, in first-occurrence order.
        List<Variable> results = new List<Variable>();
        foreach (SyntaxToken leaf in leaves)
        {
            string name = tokenToName[leaf];
            SyntaxToken[] syntaxTokens = nameToTokens[name].ToArray();
            var v = new Variable(name, syntaxTokens, methodTree);
            //check if exists
            var matches = results.Where(p => p.Name == name).ToList();
            bool alreadyExists = (matches.Count != 0);
            if (!alreadyExists)
            {
                results.Add(v);
            }
        }
        return results;
    }
}
}
}
================================================
FILE: CSharpExtractor/extract.py
================================================
#!/usr/bin/python
import itertools
import multiprocessing
import os
import sys
import shutil
import subprocess
from threading import Timer
import sys
from argparse import ArgumentParser
from subprocess import Popen, PIPE, STDOUT, call
def get_immediate_subdirectories(a_dir):
return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir)
if os.path.isdir(os.path.join(a_dir, name))]
TMP_DIR = ""
def ParallelExtractDir(args, dir):
    # Pool.starmap adapter: fans an (args, dir) pair into ExtractFeaturesForDir
    # with an empty prefix.
    ExtractFeaturesForDir(args, dir, "")
def ExtractFeaturesForDir(args, dir, prefix):
    """Run the C# extractor on `dir`; on timeout, recurse into subdirectories.

    If the extractor process was killed by the timeout, the (partial) output
    file named by args.ofile_name is removed. `prefix` accumulates directory
    name components for recursive calls (kept for interface compatibility).
    """
    command = ['dotnet', 'run', '--project', args.csproj,
               '--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
               '--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]
    # BUGFIX: `failed` was only assigned on the timeout branch, so the success
    # path raised NameError at the `if failed:` check below.
    failed = False
    kill = lambda process: process.kill()
    sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
    # NOTE(review): Timer takes seconds, so 600000 is ~7 days; 600 may have
    # been intended — confirm before changing.
    timer = Timer(600000, kill, [sleeper])
    try:
        timer.start()
        _, stderr = sleeper.communicate()
    finally:
        timer.cancel()
    if sleeper.poll() == 0:
        if len(stderr) > 0:
            # BUGFIX: print(sys.stderr, msg) wrote the stream object and the
            # message to stdout; use the file= keyword to target stderr.
            print(stderr, file=sys.stderr)
    else:
        print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr)
        failed = True
        # Retry at a finer granularity: extract each subdirectory separately.
        subdirs = get_immediate_subdirectories(dir)
        for subdir in subdirs:
            ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
    if failed:
        if os.path.exists(str(args.ofile_name)):
            os.remove(str(args.ofile_name))
def ExtractFeaturesForDirsList(args, dirs):
    """Extract features for every directory in `dirs` in parallel, then stream
    the per-worker temp output files to stdout and clean up the temp dir."""
    global TMP_DIR
    TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid())
    if os.path.exists(TMP_DIR):
        shutil.rmtree(TMP_DIR, ignore_errors=True)
    os.makedirs(TMP_DIR)
    try:
        # BUGFIX: the Pool was never closed or joined, leaking worker
        # processes; the context manager terminates it deterministically.
        with multiprocessing.Pool(4) as pool:
            pool.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs))
        for output_file in os.listdir(TMP_DIR):
            os.system("cat %s/%s" % (TMP_DIR, output_file))
    finally:
        shutil.rmtree(TMP_DIR, ignore_errors=True)
if __name__ == '__main__':
    # Command-line entry point for the C# path-context extractor.
    parser = ArgumentParser()
    parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
    parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
    parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
    parser.add_argument("--csproj", dest="csproj", required=True)
    parser.add_argument("-dir", "--dir", dest="dir", required=False)
    parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
    args = parser.parse_args()
    if args.dir is not None:
        # Extract each immediate subdirectory; when there are none, extract
        # the given directory itself.
        targets = get_immediate_subdirectories(args.dir)
        if not targets:
            targets = [args.dir.rstrip('/')]
        ExtractFeaturesForDirsList(args, targets)
================================================
FILE: Input.java
================================================
// Sample input snippet (used as the README quick-start example): a single
// method for the trained model to predict a name for.
public String getName() {
    return name;
}
================================================
FILE: JavaExtractor/JPredict/.classpath
================================================
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="Test.java" kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/>
</classpath>
================================================
FILE: JavaExtractor/JPredict/.gitignore
================================================
/target/
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java
================================================
package JavaExtractor;
import JavaExtractor.Common.CommandLineValues;
import org.kohsuke.args4j.CmdLineException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
/**
 * Entry point: extracts path-contexts from a single file (--file) or from
 * every .java file under a directory (--dir), using a fixed-size thread pool.
 */
public class App {
    private static CommandLineValues s_CommandLineValues;

    public static void main(String[] args) {
        try {
            s_CommandLineValues = new CommandLineValues(args);
        } catch (CmdLineException e) {
            e.printStackTrace();
            return;
        }
        if (s_CommandLineValues.File != null) {
            // Single-file mode runs synchronously on the caller's thread.
            ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues,
                    s_CommandLineValues.File.toPath());
            extractFeaturesTask.processFile();
        } else if (s_CommandLineValues.Dir != null) {
            extractDir();
        }
    }

    /** Walks --dir recursively and extracts every *.java file in parallel. */
    private static void extractDir() {
        ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads);
        LinkedList<ExtractFeaturesTask> tasks = new LinkedList<>();
        try {
            Files.walk(Paths.get(s_CommandLineValues.Dir)).filter(Files::isRegularFile)
                    .filter(p -> p.toString().toLowerCase().endsWith(".java")).forEach(f -> {
                        ExtractFeaturesTask task = new ExtractFeaturesTask(s_CommandLineValues, f);
                        tasks.add(task);
                    });
        } catch (IOException e) {
            e.printStackTrace();
            // BUGFIX: the executor was leaked on this early return.
            executor.shutdown();
            return;
        }
        List<Future<Void>> tasksResults;
        try {
            tasksResults = executor.invokeAll(tasks);
        } catch (InterruptedException e) {
            // BUGFIX: the original fell through with tasksResults == null and
            // dereferenced it below, throwing NullPointerException. Bail out
            // instead; the finally block still shuts the executor down.
            e.printStackTrace();
            return;
        } finally {
            executor.shutdown();
        }
        // Surface any per-task exceptions without aborting the other tasks.
        tasksResults.forEach(f -> {
            try {
                f.get();
            } catch (InterruptedException | ExecutionException e) {
                e.printStackTrace();
            }
        });
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java
================================================
package JavaExtractor.Common;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import java.io.File;
/**
* This class handles the programs arguments.
*/
/**
 * This class handles the program's arguments (bound via args4j @Option).
 */
public class CommandLineValues {
    // Extract a single file; mutually exclusive with --dir (see `forbids` below).
    @Option(name = "--file", required = false)
    public File File = null;
    // Recursively extract every .java file under this directory.
    @Option(name = "--dir", required = false, forbids = "--file")
    public String Dir = null;
    // Maximum AST path length (hops up + hops down) for a path-context.
    @Option(name = "--max_path_length", required = true)
    public int MaxPathLength;
    // Maximum child-id distance between the path's two legs at the common ancestor.
    @Option(name = "--max_path_width", required = true)
    public int MaxPathWidth;
    // Thread-pool size used in directory mode.
    @Option(name = "--num_threads", required = false)
    public int NumThreads = 64;
    // Lower bound on method size; presumably enforced by the visitors — the
    // consuming code is not visible here, confirm units (lines vs. nodes).
    @Option(name = "--min_code_len", required = false)
    public int MinCodeLength = 1;
    // Upper bound on method size; -1 disables the limit.
    @Option(name = "--max_code_len", required = false)
    public int MaxCodeLength = -1;
    // Files with more lines than this are skipped entirely; -1 disables the limit.
    @Option(name = "--max_file_len", required = false)
    public int MaxFileLength = -1;
    // Replace spaces with newline+tab in the output (human-readable form).
    @Option(name = "--pretty_print", required = false)
    public boolean PrettyPrint = false;
    // Child ids larger than this are saturated to this value in paths.
    @Option(name = "--max_child_id", required = false)
    public int MaxChildId = 3;
    // Emit each method's features as Gson JSON instead of the textual format.
    @Option(name = "--json_output", required = false)
    public boolean JsonOutput = false;

    public CommandLineValues(String... args) throws CmdLineException {
        CmdLineParser parser = new CmdLineParser(this);
        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            // Print usage before re-throwing so the caller can abort.
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
            throw e;
        }
    }

    public CommandLineValues() {
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
================================================
package JavaExtractor.Common;
import JavaExtractor.FeaturesEntities.Property;
import com.github.javaparser.ast.Node;
import com.github.javaparser.ast.UserDataKey;
import java.util.ArrayList;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/** Shared constants and string-normalization helpers for the extractor. */
public final class Common {
    public static final UserDataKey<Property> PropertyKey = new UserDataKey<Property>() {
    };
    public static final UserDataKey<Integer> ChildId = new UserDataKey<Integer>() {
    };
    public static final String EmptyString = "";
    public static final String MethodDeclaration = "MethodDeclaration";
    public static final String NameExpr = "NameExpr";
    public static final String BlankWord = "BLANK";
    public static final int c_MaxLabelLength = 50;
    public static final String methodName = "METHOD_NAME";
    public static final String internalSeparator = "|";

    /**
     * Lower-cases `original`, strips escaped newlines, whitespace, quotes,
     * apostrophes, commas and non-printable characters, then keeps only
     * alphabetic characters. Falls back to the space-underscored form, and
     * finally to `defaultString`, when nothing alphabetic survives.
     */
    public static String normalizeName(String original, String defaultString) {
        original = original.toLowerCase().replaceAll("\\\\n", "") // escaped new lines
                // BUGFIX: was "//s+", which matches literal "//s" runs, not
                // whitespace as the comment intended.
                .replaceAll("\\s+", "") // whitespaces
                .replaceAll("[\"',]", "") // quotes, apostrophes, commas
                .replaceAll("\\P{Print}", ""); // unicode weird characters
        String stripped = original.replaceAll("[^A-Za-z]", "");
        if (stripped.length() == 0) {
            String carefulStripped = original.replaceAll(" ", "_");
            if (carefulStripped.length() == 0) {
                return defaultString;
            } else {
                return carefulStripped;
            }
        } else {
            return stripped;
        }
    }

    /** True when `node` is a NameExpr directly under a MethodDeclaration, i.e. the method's name. */
    public static boolean isMethod(Node node, String type) {
        Property parentProperty = node.getParentNode().getUserData(Common.PropertyKey);
        if (parentProperty == null) {
            return false;
        }
        String parentType = parentProperty.getType();
        return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType);
    }

    /**
     * Splits an identifier on camelCase humps, underscores, digits and
     * whitespace, normalizing each surviving piece to a lower-case subtoken.
     */
    public static ArrayList<String> splitToSubtokens(String str1) {
        String str2 = str1.replace("|", " ");
        String str3 = str2.trim();
        return Stream.of(str3.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+"))
                .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString))
                .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new));
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java
================================================
package JavaExtractor.Common;
import com.github.javaparser.ast.Node;
import java.util.ArrayList;
/** Immutable holder for one extracted method: its AST leaves, name and source text. */
public class MethodContent {
    // AST leaf nodes of the method (as collected by the visitor).
    private final ArrayList<Node> leaves;
    // The method's (normalized) name.
    private final String name;
    // The method's source text.
    private final String content;

    public MethodContent(ArrayList<Node> leaves, String name, String content) {
        this.leaves = leaves;
        this.name = name;
        this.content = content;
    }

    public ArrayList<Node> getLeaves() {
        return leaves;
    }

    public String getName() {
        return name;
    }

    public String getContent() {
        return content;
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java
================================================
package JavaExtractor;
import JavaExtractor.Common.CommandLineValues;
import JavaExtractor.Common.Common;
import JavaExtractor.FeaturesEntities.ProgramFeatures;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import com.google.gson.Gson;
/** Extracts path-context features from one source file and prints them to stdout. */
class ExtractFeaturesTask implements Callable<Void> {
    private final CommandLineValues commandLineValues;
    private final Path filePath;

    public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) {
        this.commandLineValues = commandLineValues;
        this.filePath = path;
    }

    /** Thread-pool entry point; delegates to {@link #processFile()}. */
    @Override
    public Void call() {
        processFile();
        return null;
    }

    /** Extracts the file's features and writes the non-empty result to stdout. */
    public void processFile() {
        ArrayList<ProgramFeatures> features;
        try {
            features = extractSingleFile();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        if (features == null) {
            return;
        }
        String toPrint = featuresToString(features);
        if (toPrint.length() > 0) {
            System.out.println(toPrint);
        }
    }

    /** Reads the file (skipping it when longer than --max_file_len) and runs the extractor. */
    private ArrayList<ProgramFeatures> extractSingleFile() throws IOException {
        String code;
        if (commandLineValues.MaxFileLength > 0) {
            // BUGFIX: Files.lines keeps the file handle open until the stream
            // is closed; the original leaked it on every length check.
            try (java.util.stream.Stream<String> lines =
                         Files.lines(filePath, Charset.defaultCharset())) {
                if (lines.count() > commandLineValues.MaxFileLength) {
                    return new ArrayList<>();
                }
            }
        }
        try {
            code = new String(Files.readAllBytes(filePath));
        } catch (IOException e) {
            e.printStackTrace();
            code = Common.EmptyString;
        }
        FeatureExtractor featureExtractor = new FeatureExtractor(commandLineValues, this.filePath);
        return featureExtractor.extractFeatures(code);
    }

    /** Formats one output line per method: JSON when --json_output, else the textual form. */
    public String featuresToString(ArrayList<ProgramFeatures> features) {
        if (features == null || features.isEmpty()) {
            return Common.EmptyString;
        }
        List<String> methodsOutputs = new ArrayList<>();
        for (ProgramFeatures singleMethodFeatures : features) {
            String toPrint;
            if (commandLineValues.JsonOutput) {
                toPrint = new Gson().toJson(singleMethodFeatures);
            } else {
                toPrint = singleMethodFeatures.toString();
            }
            if (commandLineValues.PrettyPrint) {
                toPrint = toPrint.replace(" ", "\n\t");
            }
            methodsOutputs.add(toPrint);
        }
        return StringUtils.join(methodsOutputs, "\n");
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java
================================================
package JavaExtractor;
import JavaExtractor.Common.CommandLineValues;
import JavaExtractor.Common.Common;
import JavaExtractor.Common.MethodContent;
import JavaExtractor.FeaturesEntities.ProgramFeatures;
import JavaExtractor.FeaturesEntities.Property;
import JavaExtractor.Visitors.FunctionVisitor;
import com.github.javaparser.JavaParser;
import com.github.javaparser.ParseProblemException;
import com.github.javaparser.ast.CompilationUnit;
import com.github.javaparser.ast.Node;
import java.io.File;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.StringJoiner;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@SuppressWarnings("StringEquality")
@SuppressWarnings("StringEquality")
class FeatureExtractor {
    // Both directions render as "|" in code2seq's path format.
    private final static String upSymbol = "|";
    private final static String downSymbol = "|";
    // Parent node types whose children get a positional child-id suffix in paths.
    private static final Set<String> s_ParentTypeToAddChildId = Stream
            .of("AssignExpr", "ArrayAccessExpr", "FieldAccessExpr", "MethodCallExpr")
            .collect(Collectors.toCollection(HashSet::new));
    private final CommandLineValues m_CommandLineValues;
    private final Path filePath;

    public FeatureExtractor(CommandLineValues commandLineValues, Path filePath) {
        this.m_CommandLineValues = commandLineValues;
        this.filePath = filePath;
    }

    // Ancestor chain from `node` (inclusive, at index 0) up to the AST root.
    private static ArrayList<Node> getTreeStack(Node node) {
        ArrayList<Node> upStack = new ArrayList<>();
        Node current = node;
        while (current != null) {
            upStack.add(current);
            current = current.getParentNode();
        }
        return upStack;
    }

    // Parses `code` (with wrapper fallbacks) and emits path features per method.
    public ArrayList<ProgramFeatures> extractFeatures(String code) {
        CompilationUnit m_CompilationUnit = parseFileWithRetries(code);
        FunctionVisitor functionVisitor = new FunctionVisitor(m_CommandLineValues);
        functionVisitor.visit(m_CompilationUnit, null);
        ArrayList<MethodContent> methods = functionVisitor.getMethodContents();
        return generatePathFeatures(methods);
    }

    // Tries the snippet as-is; on failure wraps it in a dummy class+method,
    // then in a dummy class only, so bare methods/statements still parse.
    private CompilationUnit parseFileWithRetries(String code) {
        final String classPrefix = "public class Test {";
        final String classSuffix = "}";
        final String methodPrefix = "SomeUnknownReturnType f() {";
        final String methodSuffix = "return noSuchReturnValue; }";
        String content = code;
        CompilationUnit parsed;
        try {
            parsed = JavaParser.parse(content);
        } catch (ParseProblemException e1) {
            // Wrap with a class and method
            try {
                content = classPrefix + methodPrefix + code + methodSuffix + classSuffix;
                parsed = JavaParser.parse(content);
            } catch (ParseProblemException e2) {
                // Wrap with a class only
                content = classPrefix + code + classSuffix;
                parsed = JavaParser.parse(content);
            }
        }
        return parsed;
    }

    // Collects per-method features, dropping methods that yielded no paths.
    private ArrayList<ProgramFeatures> generatePathFeatures(ArrayList<MethodContent> methods) {
        ArrayList<ProgramFeatures> methodsFeatures = new ArrayList<>();
        for (MethodContent content : methods) {
            ProgramFeatures singleMethodFeatures = generatePathFeaturesForFunction(content);
            if (!singleMethodFeatures.isEmpty()) {
                methodsFeatures.add(singleMethodFeatures);
            }
        }
        return methodsFeatures;
    }

    // One path-context per leaf pair (i < j) whose connecting path survives
    // the MaxPathLength/MaxPathWidth limits.
    private ProgramFeatures generatePathFeaturesForFunction(MethodContent methodContent) {
        ArrayList<Node> functionLeaves = methodContent.getLeaves();
        ProgramFeatures programFeatures = new ProgramFeatures(
                methodContent.getName(), this.filePath, methodContent.getContent());
        for (int i = 0; i < functionLeaves.size(); i++) {
            for (int j = i + 1; j < functionLeaves.size(); j++) {
                String separator = Common.EmptyString;
                String path = generatePath(functionLeaves.get(i), functionLeaves.get(j), separator);
                // Reference comparison is deliberate (see class-level
                // @SuppressWarnings): generatePath returns the
                // Common.EmptyString constant itself on rejection.
                if (path != Common.EmptyString) {
                    Property source = functionLeaves.get(i).getUserData(Common.PropertyKey);
                    Property target = functionLeaves.get(j).getUserData(Common.PropertyKey);
                    programFeatures.addFeature(source, path, target);
                }
            }
        }
        return programFeatures;
    }

    // Renders the AST path source -> lowest common ancestor -> target, or
    // returns Common.EmptyString when length/width limits are exceeded.
    private String generatePath(Node source, Node target, String separator) {
        StringJoiner stringBuilder = new StringJoiner(separator);
        ArrayList<Node> sourceStack = getTreeStack(source);
        ArrayList<Node> targetStack = getTreeStack(target);
        // Walk both root-anchored stacks downward; commonPrefix counts the
        // shared ancestors, so the LCA sits at index (size - commonPrefix).
        int commonPrefix = 0;
        int currentSourceAncestorIndex = sourceStack.size() - 1;
        int currentTargetAncestorIndex = targetStack.size() - 1;
        while (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0
                && sourceStack.get(currentSourceAncestorIndex) == targetStack.get(currentTargetAncestorIndex)) {
            commonPrefix++;
            currentSourceAncestorIndex--;
            currentTargetAncestorIndex--;
        }
        // Path length = hops up from source + hops down to target.
        int pathLength = sourceStack.size() + targetStack.size() - 2 * commonPrefix;
        if (pathLength > m_CommandLineValues.MaxPathLength) {
            return Common.EmptyString;
        }
        // Path width = child-id distance between the two subtrees just below the LCA.
        if (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0) {
            int pathWidth = targetStack.get(currentTargetAncestorIndex).getUserData(Common.ChildId)
                    - sourceStack.get(currentSourceAncestorIndex).getUserData(Common.ChildId);
            if (pathWidth > m_CommandLineValues.MaxPathWidth) {
                return Common.EmptyString;
            }
        }
        // Upward leg: from the source leaf up to (excluding) the LCA.
        for (int i = 0; i < sourceStack.size() - commonPrefix; i++) {
            Node currentNode = sourceStack.get(i);
            String childId = Common.EmptyString;
            String parentRawType = currentNode.getParentNode().getUserData(Common.PropertyKey).getRawType();
            if (i == 0 || s_ParentTypeToAddChildId.contains(parentRawType)) {
                childId = saturateChildId(currentNode.getUserData(Common.ChildId))
                        .toString();
            }
            stringBuilder.add(String.format("%s%s%s",
                    currentNode.getUserData(Common.PropertyKey).getType(true), childId, upSymbol));
        }
        // The LCA itself (child id appended only for the listed parent types).
        Node commonNode = sourceStack.get(sourceStack.size() - commonPrefix);
        String commonNodeChildId = Common.EmptyString;
        Property parentNodeProperty = commonNode.getParentNode().getUserData(Common.PropertyKey);
        String commonNodeParentRawType = Common.EmptyString;
        if (parentNodeProperty != null) {
            commonNodeParentRawType = parentNodeProperty.getRawType();
        }
        if (s_ParentTypeToAddChildId.contains(commonNodeParentRawType)) {
            commonNodeChildId = saturateChildId(commonNode.getUserData(Common.ChildId))
                    .toString();
        }
        stringBuilder.add(String.format("%s%s",
                commonNode.getUserData(Common.PropertyKey).getType(true), commonNodeChildId));
        // Downward leg: from just below the LCA down to the target leaf.
        for (int i = targetStack.size() - commonPrefix - 1; i >= 0; i--) {
            Node currentNode = targetStack.get(i);
            String childId = Common.EmptyString;
            if (i == 0 || s_ParentTypeToAddChildId.contains(currentNode.getUserData(Common.PropertyKey).getRawType())) {
                childId = saturateChildId(currentNode.getUserData(Common.ChildId))
                        .toString();
            }
            stringBuilder.add(String.format("%s%s%s", downSymbol,
                    currentNode.getUserData(Common.PropertyKey).getType(true), childId));
        }
        return stringBuilder.toString();
    }

    // Caps a child id at --max_child_id.
    private Integer saturateChildId(int childId) {
        return Math.min(childId, m_CommandLineValues.MaxChildId);
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java
================================================
package JavaExtractor.FeaturesEntities;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.stream.Collectors;
/** All path-context features extracted from one method. */
public class ProgramFeatures {
    String name;
    // transient: excluded from the Gson JSON output (Gson skips transient
    // fields by default), so JSON carries only name/textContent/filePath.
    transient ArrayList<ProgramRelation> features = new ArrayList<>();
    String textContent;
    String filePath;

    public ProgramFeatures(String name, Path filePath, String textContent) {
        this.name = name;
        this.filePath = filePath.toAbsolutePath().toString();
        this.textContent = textContent;
    }

    /** Textual form: "&lt;name&gt; &lt;rel1&gt; &lt;rel2&gt; ..." (space-joined relations). */
    @Override
    public String toString() {
        String joinedFeatures = features.stream()
                .map(ProgramRelation::toString)
                .collect(Collectors.joining(" "));
        return name + " " + joinedFeatures;
    }

    public void addFeature(Property source, String path, Property target) {
        features.add(new ProgramRelation(source, target, path));
    }

    public boolean isEmpty() {
        return features.isEmpty();
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java
================================================
package JavaExtractor.FeaturesEntities;
/** One path-context: a source terminal, the connecting AST path, and a target terminal. */
public class ProgramRelation {
    Property source;
    Property target;
    String path;

    public ProgramRelation(Property sourceName, Property targetName, String path) {
        this.source = sourceName;
        this.target = targetName;
        this.path = path;
    }

    /** Serialized as "source,path,target" — the comma-separated context format. */
    @Override
    public String toString() {
        return source.getName() + "," + path + "," + target.getName();
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java
================================================
package JavaExtractor.FeaturesEntities;
import JavaExtractor.Common.Common;
import com.github.javaparser.ast.Node;
import com.github.javaparser.ast.expr.AssignExpr;
import com.github.javaparser.ast.expr.BinaryExpr;
import com.github.javaparser.ast.expr.IntegerLiteralExpr;
import com.github.javaparser.ast.expr.UnaryExpr;
import com.github.javaparser.ast.type.ClassOrInterfaceType;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Property {
public static final HashSet<String> NumericalKeepValues = Stream.of("0", "1", "32", "64")
.collect(Collectors.toCollection(HashSet::new));
private static final Map<String, String> shortTypes = Collections.unmodifiableMap(new HashMap<String, String>() {
/**
*
*/
private static final long serialVersionUID = 1L;
{
put("ArrayAccessExpr", "ArAc");
put("ArrayBracketPair", "ArBr");
put("ArrayCreationExpr", "ArCr");
put("ArrayCreationLevel", "ArCrLvl");
put("ArrayInitializerExpr", "ArIn");
put("ArrayType", "ArTy");
put("AssertStmt", "Asrt");
put("AssignExpr:and", "AsAn");
put("AssignExpr:assign", "As");
put("AssignExpr:lShift", "AsLS");
put("AssignExpr:minus", "AsMi");
put("AssignExpr:or", "AsOr");
put("AssignExpr:plus", "AsP");
put("AssignExpr:rem", "AsRe");
put("AssignExpr:rSignedShift", "AsRSS");
put("AssignExpr:rUnsignedShift", "AsRUS");
put("AssignExpr:slash", "AsSl");
put("AssignExpr:star", "AsSt");
put("AssignExpr:xor", "AsX");
put("BinaryExpr:and", "And");
put("BinaryExpr:binAnd", "BinAnd");
put("BinaryExpr:binOr", "BinOr");
put("BinaryExpr:divide", "Div");
put("BinaryExpr:equals", "Eq");
put("BinaryExpr:greater", "Gt");
put("BinaryExpr:greaterEquals", "Geq");
put("BinaryExpr:less", "Ls");
put("BinaryExpr:lessEquals", "Leq");
put("BinaryExpr:lShift", "LS");
put("BinaryExpr:minus", "Minus");
put("BinaryExpr:notEquals", "Neq");
put("BinaryExpr:or", "Or");
put("BinaryExpr:plus", "Plus");
put("BinaryExpr:remainder", "Mod");
put("BinaryExpr:rSignedShift", "RSS");
put("BinaryExpr:rUnsignedShift", "RUS");
put("BinaryExpr:times", "Mul");
put("BinaryExpr:xor", "Xor");
put("BlockStmt", "Bk");
put("BooleanLiteralExpr", "BoolEx");
put("CastExpr", "Cast");
put("CatchClause", "Catch");
put("CharLiteralExpr", "CharEx");
put("ClassExpr", "ClsEx");
put("ClassOrInterfaceDeclaration", "ClsD");
put("ClassOrInterfaceType", "Cls");
put("ConditionalExpr", "Cond");
put("ConstructorDeclaration", "Ctor");
put("DoStmt", "Do");
put("DoubleLiteralExpr", "Dbl");
put("EmptyMemberDeclaration", "Emp");
put("EnclosedExpr", "Enc");
put("ExplicitConstructorInvocationStmt", "ExpCtor");
put("ExpressionStmt", "Ex");
put("FieldAccessExpr", "Fld");
put("FieldDeclaration", "FldDec");
put("ForeachStmt", "Foreach");
put("ForStmt", "For");
put("IfStmt", "If");
put("InitializerDeclaration", "Init");
put("InstanceOfExpr", "InstanceOf");
put("IntegerLiteralExpr", "IntEx");
put("IntegerLiteralMinValueExpr", "IntMinEx");
put("LabeledStmt", "Labeled");
put("LambdaExpr", "Lambda");
put("LongLiteralExpr", "LongEx");
put("MarkerAnnotationExpr", "MarkerExpr");
put("MemberValuePair", "Mvp");
put("MethodCallExpr", "Cal");
put("MethodDeclaration", "Mth");
put("MethodReferenceExpr", "MethRef");
put("NameExpr", "Nm");
put("NormalAnnotationExpr", "NormEx");
put("NullLiteralExpr", "Null");
put("ObjectCreationExpr", "ObjEx");
put("Parameter", "Prm");
put("PrimitiveType", "Prim");
put("QualifiedNameExpr", "Qua");
put("ReturnStmt", "Ret");
put("SingleMemberAnnotationExpr", "SMEx");
put("StringLiteralExpr", "StrEx");
put("SuperExpr", "SupEx");
put("SwitchEntryStmt", "SwiEnt");
put("SwitchStmt", "Switch");
put("SynchronizedStmt", "Sync");
put("ThisExpr", "This");
put("ThrowStmt", "Thro");
put("TryStmt", "Try");
put("TypeDeclarationStmt", "TypeDec");
put("TypeExpr", "Type");
put("TypeParameter", "TypePar");
put("UnaryExpr:inverse", "Inverse");
put("UnaryExpr:negative", "Neg");
put("UnaryExpr:not", "Not");
put("UnaryExpr:posDecrement", "PosDec");
put("UnaryExpr:posIncrement", "PosInc");
put("UnaryExpr:positive", "Pos");
put("UnaryExpr:preDecrement", "PreDec");
put("UnaryExpr:preIncrement", "PreInc");
put("UnionType", "Unio");
put("VariableDeclarationExpr", "VDE");
put("VariableDeclarator", "VD");
put("VariableDeclaratorId", "VDID");
put("VoidType", "Void");
put("WhileStmt", "While");
put("WildcardType", "Wild");
}
});
private final String RawType;
private String Type;
private String SplitName;
// Computes the three labels used when emitting path contexts for one AST node:
// RawType (the JavaParser class simple name), Type (possibly relabeled, and
// operator-qualified for expressions) and SplitName ('|'-joined subtokens).
public Property(Node node, boolean isLeaf, boolean isGenericParent) {
Class<?> nodeClass = node.getClass();
RawType = Type = nodeClass.getSimpleName();
// Boxed types (e.g. Integer) are labeled like their primitive counterpart.
if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) {
Type = "PrimitiveType";
}
// Expression nodes are disambiguated by their operator, e.g. "BinaryExpr:plus".
String operator = "";
if (node instanceof BinaryExpr) {
operator = ((BinaryExpr) node).getOperator().toString();
} else if (node instanceof UnaryExpr) {
operator = ((UnaryExpr) node).getOperator().toString();
} else if (node instanceof AssignExpr) {
operator = ((AssignExpr) node).getOperator().toString();
}
if (operator.length() > 0) {
Type += ":" + operator;
}
String nameToSplit = node.toString();
if (isGenericParent) {
// For generic types split only the raw class name, not the full "Cls<T>" text.
nameToSplit = ((ClassOrInterfaceType) node).getName();
if (isLeaf) {
// if it is a generic parent which counts as a leaf, then when
// it is participating in a path
// as a parent, it should be GenericClass and not a simple
// ClassOrInterfaceType.
Type = "GenericClass";
}
}
ArrayList<String> splitNameParts = Common.splitToSubtokens(nameToSplit);
SplitName = String.join(Common.internalSeparator, splitNameParts);
String name = Common.normalizeName(node.toString(), Common.BlankWord);
if (name.length() > Common.c_MaxLabelLength) {
// NOTE(review): when the name is truncated, the boxed-type unboxing in the
// else-if branch below is intentionally skipped — confirm this is desired.
name = name.substring(0, Common.c_MaxLabelLength);
} else if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) {
name = ((ClassOrInterfaceType) node).toUnboxedType().toString();
}
// Method-name nodes are replaced with the Common.methodName placeholder.
if (Common.isMethod(node, Type)) {
name = SplitName = Common.methodName;
}
if (SplitName.length() == 0) {
SplitName = name;
// NumericalKeepValues is a whitelist declared earlier in this class.
if (node instanceof IntegerLiteralExpr && !NumericalKeepValues.contains(SplitName)) {
// This is a numeric literal, but not in our white list
SplitName = "<NUM>";
}
}
}
// Returns the original JavaParser class simple name, before any relabeling.
public String getRawType() {
return RawType;
}
// Returns the (possibly relabeled, operator-qualified) type label.
public String getType() {
return Type;
}
// Returns the type label, abbreviated through the shortTypes table when
// requested; types without an abbreviation are returned unchanged.
public String getType(boolean shorten) {
    return shorten ? shortTypes.getOrDefault(Type, Type) : Type;
}
// Returns the '|'-delimited subtoken name computed in the constructor.
public String getName() {
return SplitName;
}
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java
================================================
package JavaExtractor.Visitors;
import JavaExtractor.Common.CommandLineValues;
import JavaExtractor.Common.Common;
import JavaExtractor.Common.MethodContent;
import com.github.javaparser.ast.Node;
import com.github.javaparser.ast.body.MethodDeclaration;
import com.github.javaparser.ast.visitor.VoidVisitorAdapter;
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Visits every method declaration in a compilation unit, masks the method's
 * own name, and collects {@link MethodContent} entries for methods whose
 * effective body length is within the configured bounds.
 */
public class FunctionVisitor extends VoidVisitorAdapter<Object> {
    private final ArrayList<MethodContent> methods = new ArrayList<>();
    private final CommandLineValues commandLineValues;

    public FunctionVisitor(CommandLineValues commandLineValues) {
        this.commandLineValues = commandLineValues;
    }

    @Override
    public void visit(MethodDeclaration node, Object arg) {
        visitMethod(node);
        super.visit(node, arg);
    }

    private void visitMethod(MethodDeclaration node) {
        LeavesCollectorVisitor leavesCollectorVisitor = new LeavesCollectorVisitor();
        leavesCollectorVisitor.visitDepthFirst(node);
        ArrayList<Node> leaves = leavesCollectorVisitor.getLeaves();

        String normalizedMethodName = Common.normalizeName(node.getName(), Common.BlankWord);
        ArrayList<String> splitNameParts = Common.splitToSubtokens(node.getName());
        String splitName = normalizedMethodName;
        if (splitNameParts.size() > 0) {
            splitName = String.join(Common.internalSeparator, splitNameParts);
        }
        // Mask the method's own name in the AST so extracted contexts cannot
        // leak the prediction target.
        node.setName(Common.methodName);

        if (node.getBody() != null) {
            long methodLength = getMethodLength(node.getBody().toString());
            if (commandLineValues.MaxCodeLength <= 0 ||
                    (methodLength >= commandLineValues.MinCodeLength && methodLength <= commandLineValues.MaxCodeLength)) {
                methods.add(new MethodContent(leaves, splitName, node.toString()));
            }
        }
    }

    /**
     * Counts the effective source lines of a method body: brace-only lines,
     * blank lines and comment lines (starting with "/" or "*") are excluded.
     *
     * Fix: the previous implementation compared trimmed lines with the == / !=
     * operators (reference identity, suppressed via
     * {@code @SuppressWarnings("StringEquality")}), so the brace/blank filter
     * never matched and lengths were inflated. It also stripped one character
     * too many after the 2-character opening "{\n". Both are corrected here.
     *
     * @param code the printed method body, including surrounding braces
     * @return the number of non-trivial lines
     */
    private long getMethodLength(String code) {
        String cleanCode = code.replaceAll("\r\n", "\n").replaceAll("\t", " ");
        if (cleanCode.startsWith("{\n"))
            cleanCode = cleanCode.substring("{\n".length()).trim();
        if (cleanCode.endsWith("\n}"))
            cleanCode = cleanCode.substring(0, cleanCode.length() - "\n}".length()).trim();
        if (cleanCode.isEmpty()) {
            return 0;
        }
        return Arrays.stream(cleanCode.split("\n"))
                .map(String::trim)
                .filter(line -> !line.equals("{") && !line.equals("}") && !line.isEmpty())
                .filter(line -> !line.startsWith("/") && !line.startsWith("*"))
                .count();
    }

    public ArrayList<MethodContent> getMethodContents() {
        return methods;
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java
================================================
package JavaExtractor.Visitors;
import JavaExtractor.Common.Common;
import JavaExtractor.FeaturesEntities.Property;
import com.github.javaparser.ast.Node;
import com.github.javaparser.ast.comments.Comment;
import com.github.javaparser.ast.expr.NullLiteralExpr;
import com.github.javaparser.ast.stmt.Statement;
import com.github.javaparser.ast.type.ClassOrInterfaceType;
import com.github.javaparser.ast.visitor.TreeVisitor;
import java.util.ArrayList;
import java.util.List;
/**
 * Depth-first AST walker that records leaf nodes and attaches a child-id and
 * a {@link Property} to every non-comment node it visits.
 */
public class LeavesCollectorVisitor extends TreeVisitor {
    private final ArrayList<Node> m_Leaves = new ArrayList<>();

    @Override
    public void process(Node node) {
        if (node instanceof Comment) {
            return;
        }
        boolean isGenericParent = isGenericParent(node);
        boolean isLeaf = false;
        if (hasNoChildren(node) && isNotComment(node)) {
            String text = node.toString();
            // Keep only meaningful leaves; the literal text "null" is allowed
            // solely for genuine NullLiteralExpr nodes.
            if (!text.isEmpty() && (!"null".equals(text) || (node instanceof NullLiteralExpr))) {
                m_Leaves.add(node);
                isLeaf = true;
            }
        }
        node.setUserData(Common.ChildId, getChildId(node));
        node.setUserData(Common.PropertyKey, new Property(node, isLeaf, isGenericParent));
    }

    // True for a ClassOrInterfaceType carrying at least one type argument,
    // e.g. List<String>.
    private boolean isGenericParent(Node node) {
        if (!(node instanceof ClassOrInterfaceType)) {
            return false;
        }
        ClassOrInterfaceType type = (ClassOrInterfaceType) node;
        return type.getTypeArguments() != null && !type.getTypeArguments().isEmpty();
    }

    private boolean hasNoChildren(Node node) {
        return node.getChildrenNodes().isEmpty();
    }

    private boolean isNotComment(Node node) {
        return !(node instanceof Comment || node instanceof Statement);
    }

    public ArrayList<Node> getLeaves() {
        return m_Leaves;
    }

    // Position of `node` among its parent's children, matched by source range;
    // falls through to the child count when no sibling's range matches.
    private int getChildId(Node node) {
        List<Node> siblings = node.getParentNode().getChildrenNodes();
        for (int id = 0; id < siblings.size(); id++) {
            if (siblings.get(id).getRange().equals(node.getRange())) {
                return id;
            }
        }
        return siblings.size();
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/Test.java
================================================
// Minimal sample input for the extractor; the string literal contains a URL —
// presumably to exercise handling of punctuation-heavy strings. TODO confirm.
class Test {
void fooBar() {
System.out.println("http://github.com");
}
}
================================================
FILE: JavaExtractor/extract.py
================================================
#!/usr/bin/python
import itertools
import multiprocessing
import os
import shutil
import subprocess
import sys
from argparse import ArgumentParser
from threading import Timer
def get_immediate_subdirectories(a_dir):
    """Return the paths of a_dir's direct subdirectories (non-recursive)."""
    subdirs = []
    for entry in os.listdir(a_dir):
        path = os.path.join(a_dir, entry)
        if os.path.isdir(path):
            subdirs.append(path)
    return subdirs
TMP_DIR = ""
def ParallelExtractDir(args, dir):
    # Adapter used as the multiprocessing.Pool.starmap target: extracts one
    # top-level directory with an empty output-filename prefix.
    ExtractFeaturesForDir(args, dir, "")
def ExtractFeaturesForDir(args, dir, prefix):
    """Run the JavaExtractor jar on `dir` (with a one-hour kill timer),
    appending its stdout to a per-directory file under TMP_DIR, then recurse
    into the immediate subdirectories of `dir`.

    `prefix` carries the concatenated ancestor directory names so output file
    names in the flat TMP_DIR stay unique across the recursion.
    """
    command = ['java', '-Xmx100g', '-XX:MaxNewSize=60g', '-cp', args.jar, 'JavaExtractor.App',
               '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width),
               '--dir', dir, '--num_threads', str(args.num_threads)]
    # print command
    # os.system(command)
    kill = lambda process: process.kill()
    # NOTE(review): if `dir` ends with '/', split('/')[-1] is '' — presumably
    # callers always pass paths without a trailing slash; verify.
    outputFileName = TMP_DIR + prefix + dir.split('/')[-1]
    failed = False
    with open(outputFileName, 'a') as outputFile:
        sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE)
        # Kill the extractor if it has not finished within one hour.
        timer = Timer(60 * 60, kill, [sleeper])
        try:
            timer.start()
            stdout, stderr = sleeper.communicate()
        finally:
            timer.cancel()
        if sleeper.poll() == 0:
            if len(stderr) > 0:
                print(stderr, file=sys.stderr)
        else:
            # Non-zero exit: killed by the timer or crashed — both are reported
            # as a timeout, and the partial output file is removed below.
            print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr)
            failed = True
    subdirs = get_immediate_subdirectories(dir)
    for subdir in subdirs:
        ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
    if failed:
        if os.path.exists(outputFileName):
            os.remove(outputFileName)
def ExtractFeaturesForDirsList(args, dirs):
    """Extract features for every directory in `dirs` in parallel, then stream
    all per-directory result files to stdout.

    Fixes: the worker pool is now closed/joined via its context manager instead
    of being leaked, and result files are streamed with shutil.copyfileobj
    instead of shelling out to `cat` (same output, no shell involved).
    """
    global TMP_DIR
    # Per-process scratch directory where each worker appends its results.
    TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid())
    if os.path.exists(TMP_DIR):
        shutil.rmtree(TMP_DIR, ignore_errors=True)
    os.makedirs(TMP_DIR)
    try:
        with multiprocessing.Pool(6) as p:
            p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs))
        # for dir in dirs:
        #     ExtractFeaturesForDir(args, dir, '')
        output_files = os.listdir(TMP_DIR)
        for f in output_files:
            with open(os.path.join(TMP_DIR, f)) as result_file:
                shutil.copyfileobj(result_file, sys.stdout)
    finally:
        shutil.rmtree(TMP_DIR, ignore_errors=True)
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
    parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
    parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
    parser.add_argument("-j", "--jar", dest="jar", required=True)
    parser.add_argument("-dir", "--dir", dest="dir", required=False)
    parser.add_argument("-file", "--file", dest="file", required=False)
    args = parser.parse_args()
    if args.file is not None:
        # Single-file mode: run the extractor directly on one source file.
        # NOTE(review): the command is built by string concatenation and run
        # through a shell; paths containing spaces or shell metacharacters
        # would break or be interpreted — confirm inputs are trusted.
        command = 'java -cp ' + args.jar + ' JavaExtractor.App --max_path_length ' + \
                  str(args.max_path_length) + ' --max_path_width ' + str(args.max_path_width) + ' --file ' + args.file
        os.system(command)
    elif args.dir is not None:
        # Directory mode: parallelize over the immediate subdirectories, or
        # over the directory itself when it has none.
        subdirs = get_immediate_subdirectories(args.dir)
        if len(subdirs) == 0:
            subdirs = [args.dir]
        ExtractFeaturesForDirsList(args, subdirs)
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2019 Technion
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Python150kExtractor/README.md
================================================
# Python150k dataset
## Steps to reproduce
1. Download parsed python dataset from [here](https://www.sri.inf.ethz.ch/py150), unarchive and place under `PYTHON150K_DIR`:
```bash
# Replace with desired path.
>>> PYTHON150K_DIR=/path/to/data/dir
>>> mkdir -p $PYTHON150K_DIR
>>> cd $PYTHON150K_DIR
>>> wget http://files.srl.inf.ethz.ch/data/py150.tar.gz
...
>>> tar -xzvf py150.tar.gz
...
```
2. Extract samples to `DATA_DIR`:
```bash
# Replace with desired path.
>>> DATA_DIR=$(pwd)/data/default
>>> SEED=239
>>> python extract.py \
--data_dir=$PYTHON150K_DIR \
--output_dir=$DATA_DIR \
--seed=$SEED
...
```
3. Preprocess for training:
```bash
>>> ./preprocess.sh $DATA_DIR
...
```
4. Train:
```bash
>>> cd ..
>>> DESC=default
>>> CUDA=0
>>> ./train_python150k.sh $DATA_DIR $DESC $CUDA $SEED
...
```
## Test results (seed=239)
### Best scores
**setup#2**: `batch_size=64`
**setup#3**: `embedding_size=256,use_momentum=False`
**setup#4**: `batch_size=32,embedding_size=256,embeddings_dropout_keep_prob=0.5,use_momentum=False`
| params | Precision | Recall | F1 | ROUGE-2 | ROUGE-L |
|---|---|---|---|---|---|
| default | 0.37 | 0.27 | 0.31 | 0.06 | 0.38 |
| setup#2 | 0.40 | 0.31 | 0.34 | 0.08 | 0.41 |
| setup#3 | 0.36 | 0.31 | 0.33 | 0.09 | 0.38 |
| setup#4 | 0.33 | 0.25 | 0.28 | 0.05 | 0.34 |
### Ablation studies
| params | Precision | Recall | F1 | ROUGE-2 | ROUGE-L |
|---|---|---|---|---|---|
| default | 0.37 | 0.27 | 0.31 | 0.06 | 0.38 |
| no ast nodes (5th epoch) | 0.27 | 0.16 | 0.20 | 0.02 | 0.28 |
| no token split (4th epoch) | 0.60 | 0.09 | 0.15 | 0.00 | 0.60 |
================================================
FILE: Python150kExtractor/extract.py
================================================
import argparse
import re
import json
import multiprocessing
import itertools
import tqdm
import joblib
import numpy as np
from pathlib import Path
from sklearn import model_selection as sklearn_model_selection
# Sentinel tokens: the masked method name, and the placeholder for numeric
# literals that are not kept verbatim.
METHOD_NAME, NUM = 'METHODNAME', 'NUM'


def _parse_bool(value):
    """argparse-friendly boolean converter.

    Fix: the previous `type=bool` treated ANY non-empty string as True, so
    `--use_nums False` silently enabled the flag. Accepts true/false, 1/0,
    yes/no (case-insensitive); raises ArgumentTypeError otherwise.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ('true', '1', 'yes', 'y'):
        return True
    if lowered in ('false', '0', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError(f'Expected a boolean, got {value!r}')


parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', required=True, type=str)
parser.add_argument('--valid_p', type=float, default=0.2)
parser.add_argument('--max_path_length', type=int, default=8)
parser.add_argument('--max_path_width', type=int, default=2)
parser.add_argument('--use_method_name', type=_parse_bool, default=True)
parser.add_argument('--use_nums', type=_parse_bool, default=True)
parser.add_argument('--output_dir', required=True, type=str)
parser.add_argument('--n_jobs', type=int, default=multiprocessing.cpu_count())
parser.add_argument('--seed', type=int, default=239)
def __collect_asts(json_file):
    """Yield raw JSON lines from json_file, reporting progress via tqdm."""
    with open(json_file, 'r', encoding='utf-8') as handle:
        yield from tqdm.tqdm(handle)
def __terminals(ast, node_index, args):
stack, paths = [], []
def dfs(v):
stack.append(v)
v_node = ast[v]
if 'value' in v_node:
if v == node_index: # Top-level func def node.
if args.use_method_name:
paths.append((stack.copy(), METHOD_NAME))
else:
v_type = v_node['type']
if v_type.startswith('Name'):
paths.append((stack.copy(), v_node['value']))
elif args.use_nums and v_type == 'Num':
paths.append((stack.copy(), NUM))
else:
pass
if 'children' in v_node:
for child in v_node['children']:
dfs(child)
stack.pop()
dfs(node_index)
return paths
def __merge_terminals2_paths(v_path, u_path):
s, n, m = 0, len(v_path), len(u_path)
while s < min(n, m) and v_path[s] == u_path[s]:
s += 1
prefix = list(reversed(v_path[s:]))
lca = v_path[s - 1]
suffix = u_path[s:]
return prefix, lca, suffix
def __raw_tree_paths(ast, node_index, args):
    """Build every terminal-to-terminal tree path for the function at
    ast[node_index], keeping only paths within the configured length/width
    limits. Returns (start_token, node_index_path, end_token) tuples.
    """
    terminals = __terminals(ast, node_index, args)
    tree_paths = []
    for (v_path, v_value), (u_path, u_value) in itertools.combinations(terminals, 2):
        prefix, lca, suffix = __merge_terminals2_paths(v_path, u_path)
        too_long = len(prefix) + 1 + len(suffix) > args.max_path_length
        too_wide = abs(len(prefix) - len(suffix)) > args.max_path_width
        if too_long or too_wide:
            continue
        tree_paths.append((v_value, prefix + [lca] + suffix, u_value))
    return tree_paths
def __delim_name(name):
    """Lower-case an identifier and join its snake/camel-case subtokens with
    '|'; the METHOD_NAME and NUM sentinels pass through unchanged.
    """
    if name in {METHOD_NAME, NUM}:
        return name

    camel_pattern = re.compile(
        '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
    )

    def split_camel(chunk):
        return [match.group(0) for match in camel_pattern.finditer(chunk)]

    subtokens = []
    for chunk in name.split('_'):
        subtokens.extend(split_camel(chunk))
    return '|'.join(token.lower() for token in subtokens)
def __collect_sample(ast, fd_index, args):
    """Turn the FunctionDef at ast[fd_index] into a 'target ctx ctx ...'
    sample line, or None when no contexts survive the path limits.

    Raises ValueError if the node is not a FunctionDef.
    """
    root = ast[fd_index]
    if root['type'] != 'FunctionDef':
        raise ValueError('Wrong node type.')
    target = root['value']

    contexts = []
    for start, connector, finish in __raw_tree_paths(ast, fd_index, args):
        start, finish = __delim_name(start), __delim_name(finish)
        connector = '|'.join(ast[v]['type'] for v in connector)
        contexts.append(f'{start},{connector},{finish}')
    if not contexts:
        return None

    target = __delim_name(target)
    context = ' '.join(contexts)
    return f'{target} {context}'
def __collect_samples(ast, args):
    """Extract a sample line for every FunctionDef node in the flattened AST."""
    samples = []
    for index, node in enumerate(ast):
        if node['type'] != 'FunctionDef':
            continue
        sample = __collect_sample(ast, index, args)
        if sample is not None:
            samples.append(sample)
    return samples
def __collect_all_and_save(asts, args, output_file):
    """Extract samples from all ASTs in parallel (joblib) and write them to
    output_file, one sample per line, with no trailing newline.
    """
    delayed_collect = joblib.delayed(__collect_samples)
    per_ast = joblib.Parallel(n_jobs=args.n_jobs)(
        delayed_collect(ast, args) for ast in tqdm.tqdm(asts)
    )
    flattened = list(itertools.chain.from_iterable(per_ast))
    with open(output_file, 'w') as out:
        out.write('\n'.join(flattened))
def main():
    """Read raw Python150k ASTs, split train/valid (test = the eval file),
    and write one extracted-sample file per split into output_dir.
    """
    args = parser.parse_args()
    np.random.seed(args.seed)

    data_dir = Path(args.data_dir)
    trains = list(__collect_asts(data_dir / 'python100k_train.json'))
    evals = list(__collect_asts(data_dir / 'python50k_eval.json'))

    # Validation is carved out of the training file; the eval file is the test set.
    train, valid = sklearn_model_selection.train_test_split(
        trains,
        test_size=args.valid_p,
    )

    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True)
    for split_name, split in (('train', train), ('valid', valid), ('test', evals)):
        output_file = output_dir / f'{split_name}_output_file.txt'
        __collect_all_and_save((json.loads(line) for line in split), args, output_file)
================================================
FILE: Python150kExtractor/preprocess.sh
================================================
#!/usr/bin/env bash
# Preprocesses extracted Python150k samples for code2seq training:
#   1. builds target / subtoken / AST-node histograms from the training split,
#   2. runs ../preprocess.py with those histograms,
#   3. deletes the intermediate histogram files.
# Usage: ./preprocess.sh [data_dir]   (data_dir defaults to "data")

# Context and vocabulary limits handed to preprocess.py.
MAX_CONTEXTS=200
MAX_DATA_CONTEXTS=1000
SUBTOKEN_VOCAB_SIZE=186277
TARGET_VOCAB_SIZE=26347

# First positional argument, or "data" when omitted.
data_dir=${1:-data}
mkdir -p "${data_dir}"
train_data_file=$data_dir/train_output_file.txt
valid_data_file=$data_dir/valid_output_file.txt
test_data_file=$data_dir/test_output_file.txt

echo "Creating histograms from the training data..."
target_histogram_file=$data_dir/histo.tgt.c2s
source_subtoken_histogram=$data_dir/histo.ori.c2s
node_histogram_file=$data_dir/histo.node.c2s
# Field 1 of each sample line is the '|'-separated target: count its subtokens.
cut <"${train_data_file}" -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' >"${target_histogram_file}"
# Fields 2+ are "token,path,token" contexts: count the terminal-token subtokens.
cut <"${train_data_file}" -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' >"${source_subtoken_histogram}"
# The middle field is the '|'-separated AST-node path: count each node type.
cut <"${train_data_file}" -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' >"${node_histogram_file}"

echo "Preprocessing..."
python ../preprocess.py \
  --train_data "${train_data_file}" \
  --val_data "${valid_data_file}" \
  --test_data "${test_data_file}" \
  --max_contexts ${MAX_CONTEXTS} \
  --max_data_contexts ${MAX_DATA_CONTEXTS} \
  --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \
  --target_vocab_size ${TARGET_VOCAB_SIZE} \
  --target_histogram "${target_histogram_file}" \
  --subtoken_histogram "${source_subtoken_histogram}" \
  --node_histogram "${node_histogram_file}" \
  --output_name "${data_dir}"/"$(basename "${data_dir}")"

# The histograms are only needed by preprocess.py; clean them up.
rm \
  "${target_histogram_file}" \
  "${source_subtoken_histogram}" \
  "${node_histogram_file}"
================================================
FILE: README.md
================================================
# code2seq
This is an official implementation of the model described in:
[Uri Alon](http://urialon.cswp.cs.technion.ac.il), [Shaked Brody](http://www.cs.technion.ac.il/people/shakedbr/), [Omer Levy](https://levyomer.wordpress.com) and [Eran Yahav](http://www.cs.technion.ac.il/~yahave/), "code2seq: Generating Sequences from Structured Representations of Code" [[PDF]](https://openreview.net/pdf?id=H1gKYo09tX)
Appeared in **ICLR'2019** (**poster** available [here](https://urialon.cswp.cs.technion.ac.il/wp-content/uploads/sites/83/2019/05/ICLR19_poster_code2seq.pdf))
An **online demo** is available at [https://code2seq.org](https://code2seq.org).
This is a TensorFlow implementation of the network, with Java and C# extractors for preprocessing the input code.
It can be easily extended to other languages,
since the TensorFlow network is agnostic to the input programming language (see [Extending to other languages](#extending-to-other-languages)).
Contributions are welcome.
<center style="padding: 40px"><img width="70%" src="https://github.com/tech-srl/code2seq/raw/master/images/network.png" /></center>
## See also:
* **Structural Language Models for Code** (ICML'2020) is a new paper that learns to generate the missing code within a larger code snippet. This is similar to code completion, but is able to predict complex expressions rather than a single token at a time. See [PDF](https://arxiv.org/pdf/1910.00577.pdf), demo at [http://AnyCodeGen.org](http://AnyCodeGen.org).
* **Adversarial Examples for Models of Code** is a new paper that shows how to slightly mutate the input code snippet of code2vec and GNNs models (thus, introducing adversarial examples), such that the model (code2vec or GNNs) will output a prediction of our choice. See [PDF](https://arxiv.org/pdf/1910.07517.pdf) (code: soon).
* **Neural Reverse Engineering of Stripped Binaries** is a new paper that learns to predict procedure names in stripped binaries, thus use neural networks for reverse engineering. See [PDF](https://arxiv.org/pdf/1902.09122) (code: soon).
* **code2vec** (POPL'2019) is our previous model. It can only generate a single label at a time (rather than a sequence as code2seq), but it is much faster to train (because of its simplicity). See [PDF](https://urialon.cswp.cs.technion.ac.il/wp-content/uploads/sites/83/2018/12/code2vec-popl19.pdf), demo at [https://code2vec.org](https://code2vec.org) and [code](https://github.com/tech-srl/code2vec/).
Table of Contents
=================
* [Requirements](#requirements)
* [Quickstart](#quickstart)
* [Configuration](#configuration)
  * [Releasing a trained model](#releasing-a-trained-model)
* [Extending to other languages](#extending-to-other-languages)
* [Datasets](#datasets)
* [Baselines](#baselines)
* [Citation](#citation)
## Requirements
* [python3](https://www.linuxbabe.com/ubuntu/install-python-3-6-ubuntu-16-04-16-10-17-04)
* TensorFlow 1.12 ([install](https://www.tensorflow.org/install/install_linux)). To check TensorFlow version:
> python3 -c 'import tensorflow as tf; print(tf.\_\_version\_\_)'
- For a TensorFlow 2.1 implementation by [@Kolkir](https://github.com/Kolkir/), see: [https://github.com/Kolkir/code2seq](https://github.com/Kolkir/code2seq)
* For [creating a new Java dataset](#creating-and-preprocessing-a-new-java-dataset) or [manually examining a trained model](#step-4-manual-examination-of-a-trained-model) (any operation that requires parsing of a new code example): [JDK](https://openjdk.java.net/install/)
* For creating a C# dataset: [dotnet-core](https://dotnet.microsoft.com/download) version 2.2 or newer.
* `pip install rouge` for computing rouge scores.
## Quickstart
### Step 0: Cloning this repository
```
git clone https://github.com/tech-srl/code2seq
cd code2seq
```
### Step 1: Creating a new dataset from Java sources
To obtain a preprocessed dataset to train a network on, you can either download our
preprocessed dataset, or create a new dataset from Java source files.
#### Download our preprocessed dataset Java-large dataset (~16M examples, compressed: 11G, extracted 125GB)
```
mkdir data
cd data
wget https://s3.amazonaws.com/code2seq/datasets/java-large-preprocessed.tar.gz
tar -xvzf java-large-preprocessed.tar.gz
```
This will create a `data/java-large/` sub-directory, containing the files that hold training, test and validation sets,
and a dict file for various dataset properties.
#### Creating and preprocessing a new Java dataset
To create and preprocess a new dataset (for example, to compare code2seq to another model on another dataset):
* Edit the file [preprocess.sh](preprocess.sh) using the instructions there, pointing it to the correct training, validation and test directories.
* Run the preprocess.sh file:
> bash preprocess.sh
### Step 2: Training a model
You can either download an already trained model, or train a new model using a preprocessed dataset.
#### Downloading a trained model (137 MB)
We already trained a model for 52 epochs on the data that was preprocessed in the previous step. This model is the same model that was
used in the paper and the same model that serves the demo at [code2seq.org](https://code2seq.org).
```
wget https://s3.amazonaws.com/code2seq/model/java-large/java-large-model.tar.gz
tar -xvzf java-large-model.tar.gz
```
##### Note:
This trained model is in a "released" state, which means that we stripped it from its training parameters.
#### Training a model from scratch
To train a model from scratch:
* Edit the file [train.sh](train.sh) to point it to the right preprocessed data. By default,
it points to our "java-large" dataset that was preprocessed in the previous step.
* Before training, you can edit the configuration hyper-parameters in the file [config.py](config.py),
as explained in [Configuration](#configuration).
* Run the [train.sh](train.sh) script:
```
bash train.sh
```
### Step 3: Evaluating a trained model
After `config.PATIENCE` iterations of no improvement on the validation set, training stops by itself.
Suppose that iteration #52 is our chosen model, run:
```
python3 code2seq.py --load models/java-large-model/model_iter52.release --test data/java-large/java-large.test.c2s
```
While evaluating, a file named "log.txt" is written to the same dir as the saved models, with each test example name and the model's prediction.
### Step 4: Manual examination of a trained model
To manually examine a trained model, run:
```
python3 code2seq.py --load models/java-large-model/model_iter52.release --predict
```
After the model loads, follow the instructions and edit the file `Input.java` and enter a Java
method or code snippet, and examine the model's predictions and attention scores.
#### Note:
Due to TensorFlow's limitations, if using beam search (`config.BEAM_WIDTH > 0`), then `BEAM_WIDTH` hypotheses will be printed, but
without attention weights. If not using beam search (`config.BEAM_WIDTH == 0`), then a single hypothesis will be printed *with
the attention weights* in every decoding timestep.
## Configuration
Changing hyper-parameters is possible by editing the file [config.py](config.py).
Here are some of the parameters and their description:
#### config.NUM_EPOCHS = 3000
The max number of epochs to train the model.
#### config.SAVE_EVERY_EPOCHS = 1
The frequency, in epochs, of saving a model and evaluating on the validation set during training.
#### config.PATIENCE = 10
Controlling early stopping: how many epochs of no improvement should training continue before stopping.
#### config.BATCH_SIZE = 512
Batch size during training.
#### config.TEST_BATCH_SIZE = 256
Batch size during evaluation. Affects only the evaluation speed and memory consumption, does not affect the results.
#### config.SHUFFLE_BUFFER_SIZE = 10000
The buffer size that the reader uses for shuffling the training data.
Controls the randomness of the data.
Increasing this value might hurt training throughput.
#### config.CSV_BUFFER_SIZE = 100 * 1024 * 1024
The buffer size (in bytes) of the CSV dataset reader.
#### config.MAX_CONTEXTS = 200
The number of contexts to sample in each example during training
(resampling a different subset of this size every training iteration).
#### config.SUBTOKENS_VOCAB_MAX_SIZE = 190000
The max size of the subtoken vocabulary.
#### config.TARGET_VOCAB_MAX_SIZE = 27000
The max size of the target words vocabulary.
#### config.EMBEDDINGS_SIZE = 128
Embedding size for subtokens, AST nodes and target symbols.
#### config.RNN_SIZE = 128 * 2
The total size of the two LSTMs that are used to embed the paths if `config.BIRNN` is `True`, or the size of the single LSTM if `config.BIRNN` is `False`.
#### config.DECODER_SIZE = 320
Size of each LSTM layer in the decoder.
#### config.NUM_DECODER_LAYERS = 1
Number of decoder LSTM layers. Can be increased to support long target sequences.
#### config.MAX_PATH_LENGTH = 8 + 1
The max number of nodes in a path
#### config.MAX_NAME_PARTS = 5
The max number of subtokens in an input token. If the token is longer, only the first subtokens will be read.
#### config.MAX_TARGET_PARTS = 6
The max number of symbols in the target sequence.
Set to 6 by default for method names, but can be increased for learning datasets with longer sequences.
#### config.BIRNN = True
If True, use a bidirectional LSTM to encode each path. If False, use a unidirectional LSTM only.
#### config.RANDOM_CONTEXTS = True
When True, sample `MAX_CONTEXT` from every example every training iteration.
When False, take the first `MAX_CONTEXTS` only.
#### config.BEAM_WIDTH = 0
Beam width in beam search. Inactive when 0.
#### config.USE_MOMENTUM = True
If `True`, use Momentum optimizer with nesterov. If `False`, use Adam
(Adam converges in fewer epochs; Momentum leads to slightly better results).
## Releasing a trained model
If you wish to keep a trained model for inference only (without the ability to continue training it) you can
release the model using:
```
python3 code2seq.py --load models/java-large-model/model_iter52 --release
```
This will save a copy of the trained model with the '.release' suffix.
A "released" model usually takes ~3x less disk space.
## Extending to other languages
This project currently supports Java and C\# as the input languages.
_**March 2020** - a code2seq extractor for **C++** based on LLVM was developed by [@Kolkir](https://github.com/Kolkir/) and is available here: [https://github.com/Kolkir/cppminer](https://github.com/Kolkir/cppminer)._
_**January 2020** - a code2seq extractor for Python (specifically targeting the Python150k dataset) was contributed by [@stasbel](https://github.com/stasbel). See: [https://github.com/tech-srl/code2seq/tree/master/Python150kExtractor](https://github.com/tech-srl/code2seq/tree/master/Python150kExtractor)._
_**January 2020** - an extractor for predicting TypeScript type annotations for JavaScript input using code2vec was developed by [@izosak](https://github.com/izosak) and Noa Cohen, and is available here:
[https://github.com/tech-srl/id2vec](https://github.com/tech-srl/id2vec)._
~~_**June 2019** - an extractor for **C** that is compatible with our model was developed by [CMU SEI team](https://github.com/cmu-sei/code2vec-c)._~~ - removed by CMU SEI team.
_**June 2019** - a code2vec extractor for **Python, Java, C, C++** by JetBrains Research is available here: [PathMiner](https://github.com/JetBrains-Research/astminer)._
To extend code2seq to other languages other than Java and C#, a new extractor (similar to the [JavaExtractor](JavaExtractor))
should be implemented, and be called by [preprocess.sh](preprocess.sh).
Basically, an extractor should be able to output for each directory containing source files:
* A single text file, where each row is an example.
* Each example is a space-delimited list of fields, where:
1. The first field is the target label, internally delimited by the "|" character (for example: `compare|ignore|case`)
2. Each of the following field are contexts, where each context has three components separated by commas (","). None of these components can include spaces nor commas.
We refer to these three components as a token, a path, and another token, but in general other types of ternary contexts can be considered.
Each "token" component is a token in the code, split to subtokens using the "|" character.
Each path is a path between two tokens, split to path nodes (or other kinds of building blocks) using the "|" character.
Example for a context:
`my|key,StringExpression|MethodCall|Name,get|value`
Here `my|key` and `get|value` are tokens, and `StringExpression|MethodCall|Name` is the syntactic path that connects them.
## Datasets
### Java
To download the Java-small, Java-med and Java-large datasets used in the Code Summarization task as raw `*.java` files, use:
* [Java-small](https://s3.amazonaws.com/code2seq/datasets/java-small.tar.gz)
* [Java-med](https://s3.amazonaws.com/code2seq/datasets/java-med.tar.gz)
* [Java-large](https://s3.amazonaws.com/code2seq/datasets/java-large.tar.gz)
To download the preprocessed datasets, use:
* [Java-small-preprocessed](https://s3.amazonaws.com/code2seq/datasets/java-small-preprocessed.tar.gz)
* [Java-med-preprocessed](https://s3.amazonaws.com/code2seq/datasets/java-med-preprocessed.tar.gz)
* [Java-large-preprocessed](https://s3.amazonaws.com/code2seq/datasets/java-large-preprocessed.tar.gz)
### C#
The C# dataset used in the Code Captioning task can be downloaded from the [CodeNN](https://github.com/sriniiyer/codenn/) repository.
## Baselines
### Using the trained model
For the NMT baselines (BiLSTM, Transformer) we used the implementation of [OpenNMT-py](http://opennmt.net/OpenNMT-py/).
The trained BiLSTM model is available here:
`https://code2seq.s3.amazonaws.com/lstm_baseline/model_acc_62.88_ppl_12.03_e16.pt`
Test+validation sources and targets:
```
https://code2seq.s3.amazonaws.com/lstm_baseline/test_expected_actual.txt
https://code2seq.s3.amazonaws.com/lstm_baseline/test_source.txt
https://code2seq.s3.amazonaws.com/lstm_baseline/test_target.txt
https://code2seq.s3.amazonaws.com/lstm_baseline/val_source.txt
https://code2seq.s3.amazonaws.com/lstm_baseline/val_target.txt
```
The command line for "translating" a "source" file to a "target" is:
`python3 translate.py -model model_acc_62.88_ppl_12.03_e16.pt -src test_source.txt -output translation_epoch16.txt -gpu 0`
This results in a `translation_epoch16.txt` which we compare to `test_target.txt` to compute the score.
The file `test_expected_actual.txt` is a line-by-line concatenation of the true reference ("expected") with the corresponding prediction (the "actual").
### Creating data for the baseline
We first modified the JavaExtractor (the same one as in this repository) to locate the methods to train on and print them to a file where each method is a single line. This modification is currently not checked in, but instead of extracting paths, it just prints `node.toString()` and replaces "\n" with space, where `node` is the object holding the AST node of type `MethodDeclaration`.
Then, we tokenized (including sub-tokenization of identifiers, i.e., `"ArrayList"-> ["Array","List"])` each method body using `javalang`, using [this](baseline_tokenization/subtokenize_nmt_baseline.py) script (which can be run on [this](baseline_tokenization/input_example.txt) input example).
So a program of:
```
void methodName(String fooBar) {
System.out.println("hello world");
}
```
should be printed by the modified JavaExtractor as:
```method name|void (String fooBar){ System.out.println("hello world");}```
and the tokenization script would turn it into:
```void ( String foo Bar ) { System . out . println ( " hello world " ) ; }```
and the label to be predicted, i.e., "method name", into a separate file.
OpenNMT-py can then be trained over these training source and target files.
## Citation
[code2seq: Generating Sequences from Structured Representations of Code](https://arxiv.org/pdf/1808.01400)
```
@inproceedings{
alon2018codeseq,
title={code2seq: Generating Sequences from Structured Representations of Code},
author={Uri Alon and Shaked Brody and Omer Levy and Eran Yahav},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=H1gKYo09tX},
}
```
================================================
FILE: __init__.py
================================================
================================================
FILE: baseline_tokenization/input_example.txt
================================================
requires landscape|boolean (){ return false; }
get parent key|Object (){ return new ContactsUiKey(); }
get parent key|Object (){ return new ContactsUiKey(); }
get layout id|int (){ return R.layout.loose_screen; }
get parent key|Object (){ return new EditContactKey(contactId); }
to contact|Contact (){ return new Contact(id, name, email); }
to string|String (){ return "Welcome!\nClick to continue."; }
get parent key|Object (){ return new EditContactKey(contactId); }
tear down services|void (@NonNull Services services){ }
get layout id|int (){ return R.layout.landscape_screen; }
================================================
FILE: baseline_tokenization/javalang/__init__.py
================================================
from . import parser
from . import parse
from . import tokenizer
from . import javadoc
__version__ = "0.10.1"
================================================
FILE: baseline_tokenization/javalang/ast.py
================================================
import pickle
import six
class MetaNode(type):
    """Metaclass for AST nodes that makes ``attrs`` cumulative.

    When a class is created, the concatenation of every base class's
    ``attrs`` is prepended to the class's own ``attrs``, so each Node
    subclass exposes the full ordered list of attribute names it
    inherits plus defines.
    """
    def __new__(mcs, name, bases, namespace):
        # Renamed the third parameter from `dict` to `namespace` to stop
        # shadowing the builtin; it is passed positionally by the type
        # machinery, so the rename is interface-safe.
        own_attrs = list(namespace['attrs'])
        combined = list()
        for base in bases:
            if hasattr(base, 'attrs'):
                combined.extend(base.attrs)
        combined.extend(own_attrs)
        namespace['attrs'] = combined
        return type.__new__(mcs, name, bases, namespace)
@six.add_metaclass(MetaNode)
class Node(object):
    """Base class for every AST node; MetaNode accumulates ``attrs``."""
    attrs = ()
    def __init__(self, **kwargs):
        # Accept only the keyword arguments named in ``attrs``; missing
        # attributes default to None, unknown ones raise ValueError.
        values = kwargs.copy()
        for attr_name in self.attrs:
            value = values.pop(attr_name, None)
            setattr(self, attr_name, value)
        if values:
            raise ValueError('Extraneous arguments')
    def __equals__(self, other):
        # NOTE(review): this is not Python's __eq__ hook, so '==' between
        # nodes still falls back to identity comparison; confirm the
        # misnaming is intentional before relying on structural equality
        # (Node.filter's `node == pattern` is affected).
        if type(other) is not type(self):
            return False
        for attr in self.attrs:
            if getattr(other, attr) != getattr(self, attr):
                return False
        return True
    def __repr__(self):
        # Nodes print as their class name only.
        return type(self).__name__
    def __iter__(self):
        # Iterating a node walks its whole subtree (see walk_tree below).
        return walk_tree(self)
    def filter(self, pattern):
        # Yield (path, node) pairs whose node is an instance of ``pattern``
        # (when pattern is a type) or compares equal to it.
        for path, node in self:
            if ((isinstance(pattern, type) and isinstance(node, pattern)) or
                    (node == pattern)):
                yield path, node
    @property
    def children(self):
        # Attribute values in ``attrs`` order; consumed by walk_tree.
        return [getattr(self, attr_name) for attr_name in self.attrs]
def walk_tree(root):
    """Depth-first generator over ``root`` yielding (path, node) pairs.

    ``path`` is the tuple of ancestor objects leading to each Node; lists
    and tuples are traversed transparently but are never yielded
    themselves.
    """
    if isinstance(root, Node):
        yield (), root
        subtrees = root.children
    else:
        subtrees = root
    for subtree in subtrees:
        if not isinstance(subtree, (Node, list, tuple)):
            continue
        for sub_path, descendant in walk_tree(subtree):
            yield (root,) + sub_path, descendant
def dump(ast, file):
    """Serialize ``ast`` into the open binary ``file`` via pickle."""
    file.write(pickle.dumps(ast))
def load(file):
    """Read one pickled AST back from the open binary ``file``."""
    ast = pickle.load(file)
    return ast
================================================
FILE: baseline_tokenization/javalang/javadoc.py
================================================
import re
def join(s):
    """Collapse a multi-line string into one space-joined line of
    stripped pieces."""
    pieces = [line.strip() for line in s.split('\n')]
    return ' '.join(pieces)
class DocBlock(object):
    """Structured contents of a single parsed Javadoc comment."""

    def __init__(self):
        self.description = ''
        self.return_doc = None
        self.params = []
        self.authors = []
        self.deprecated = False
        # @exception and @throw are equivalent
        self.throws = {}
        self.exceptions = self.throws
        self.tags = {}

    def add_block(self, name, value):
        """Record one ``@name value`` tag block on this doc object."""
        value = value.strip()
        if name == 'param':
            parts = value.split(None, 1)
            if len(parts) < 2:
                param, description = value, ''
            else:
                param, description = parts
            self.params.append((param, join(description)))
        elif name in ('throws', 'exception'):
            parts = value.split(None, 1)
            if len(parts) < 2:
                ex, description = value, ''
            else:
                ex, description = parts
            self.throws[ex] = join(description)
        elif name == 'return':
            self.return_doc = value
        elif name == 'author':
            self.authors.append(value)
        elif name == 'deprecated':
            self.deprecated = True
        # Every tag, recognized or not, is also collected verbatim.
        self.tags.setdefault(name, []).append(value)
# Split points: a '@' at the start of a (left-justified) line opens a
# tag block; the capturing group keeps the '@' in re.split's output.
blocks_re = re.compile('(^@)', re.MULTILINE)
# The leading '*' (with any indent) on each line of the comment body.
leading_space_re = re.compile(r'^\s*\*', re.MULTILINE)
# Indented '@' at line start, normalized to column 0 by _force_blocks_left.
blocks_justify_re = re.compile(r'^\s*@', re.MULTILINE)
def _sanitize(s):
s = s.strip()
if not (s[:3] == '/**' and s[-2:] == '*/'):
raise ValueError('not a valid Javadoc comment')
s = s.replace('\t', ' ')
return s
def _uncomment(s):
    """Drop the /** and */ delimiters and each line's leading '*'."""
    body = s[3:-2].strip()
    return leading_space_re.sub('', body)
def _get_indent_level(s):
return len(s) - len(s.lstrip())
def _left_justify(s):
    """Remove the indentation shared by every non-blank line of ``s``.

    Blank lines are ignored when measuring the common indent; when the
    common indent is zero, ``s`` is returned untouched.
    """
    lines = s.rstrip().splitlines()
    if not lines:
        return ''
    indents = [_get_indent_level(line) for line in lines if line.strip()]
    common_indent = min(indents)
    if common_indent == 0:
        return s
    return '\n'.join(line[common_indent:] for line in lines)
def _force_blocks_left(s):
    """Move every line-leading '@' tag to column zero."""
    justified = blocks_justify_re.sub('@', s)
    return justified
def parse(raw):
    """Parse a raw /** ... */ Javadoc comment string into a DocBlock."""
    sanitized = _sanitize(raw)
    uncommented = _uncomment(sanitized)
    justified = _left_justify(uncommented)
    justified_fixed = _force_blocks_left(justified)
    prepared = justified_fixed
    # blocks_re splits on '@' at line start; because the pattern is a
    # capturing group, the '@' separators are kept in the result list,
    # so tag bodies sit at every second index after each separator.
    blocks = blocks_re.split(prepared)
    doc = DocBlock()
    if blocks[0] != '@':
        # Text before the first tag (possibly empty) is the description.
        doc.description = blocks[0].strip()
        blocks = blocks[2::2]
    else:
        blocks = blocks[1::2]
    for block in blocks:
        # First word of each block is the tag name, the rest its value.
        try:
            tag, value = block.split(None, 1)
        except ValueError:
            tag, value = block, ''
        doc.add_block(tag, value)
    return doc
================================================
FILE: baseline_tokenization/javalang/parse.py
================================================
from .parser import Parser
from .tokenizer import tokenize
def parse_expression(exp):
    """Parse a single Java expression string into its AST node."""
    source = exp if exp.endswith(';') else exp + ';'
    return Parser(tokenize(source)).parse_expression()
def parse_member_signature(sig):
    """Parse a member (method/field) signature, appending ';' if absent."""
    source = sig if sig.endswith(';') else sig + ';'
    return Parser(tokenize(source)).parse_member_declaration()
def parse_constructor_signature(sig):
    """Parse a constructor signature, supplying an empty body.

    A trailing ';' (if present) is replaced by the empty body.
    """
    if sig.endswith(';'):
        sig = sig[:-1]
    return Parser(tokenize(sig + '{ }')).parse_member_declaration()
def parse_type(s):
    """Parse a Java type expression string."""
    return Parser(tokenize(s)).parse_type()
def parse_type_signature(sig):
    """Parse a class/interface signature, supplying an empty body.

    A trailing ';' (if present) is replaced by the empty body.
    """
    if sig.endswith(';'):
        sig = sig[:-1]
    return Parser(tokenize(sig + '{ }')).parse_class_or_interface_declaration()
def parse(s):
    """Parse a full Java compilation unit from source text."""
    return Parser(tokenize(s)).parse()
================================================
FILE: baseline_tokenization/javalang/parser.py
================================================
import six
from . import util
from . import tree
from .tokenizer import (
EndOfInput, Keyword, Modifier, BasicType, Identifier,
Annotation, Literal, Operator, JavaToken,
)
# Compile-time switch: when False, parse_debug returns methods unwrapped.
ENABLE_DEBUG_SUPPORT = False

def parse_debug(method):
    """Optionally wrap a parse method with recursion-depth tracing.

    When ENABLE_DEBUG_SUPPORT is True the returned wrapper maintains
    ``self.recursion_depth`` and, if ``self.debug`` is set, prints each
    method's entry and exit together with the current token. When support
    is disabled the method is returned unmodified (zero overhead).
    """
    global ENABLE_DEBUG_SUPPORT
    if ENABLE_DEBUG_SUPPORT:
        def _method(self):
            if not hasattr(self, 'recursion_depth'):
                self.recursion_depth = 0
            if self.debug:
                depth = "%02d" % (self.recursion_depth,)
                token = six.text_type(self.tokens.look())
                start_value = self.tokens.look().value
                name = method.__name__
                sep = ("-" * self.recursion_depth)
                e_message = ""
                print("%s %s> %s(%s)" % (depth, sep, name, token))
                self.recursion_depth += 1
                try:
                    r = method(self)
                except JavaSyntaxError as e:
                    # Remember the error text for the exit trace line.
                    e_message = e.description
                    raise
                except Exception as e:
                    e_message = six.text_type(e)
                    raise
                finally:
                    # The exit trace runs on success and failure alike.
                    token = six.text_type(self.tokens.last())
                    print("%s <%s %s(%s, %s) %s" %
                          (depth, sep, name, start_value, token, e_message))
                    self.recursion_depth -= 1
            else:
                self.recursion_depth += 1
                try:
                    r = method(self)
                finally:
                    self.recursion_depth -= 1
            return r
        return _method
    else:
        return method
# ------------------------------------------------------------------------------
# ---- Parsing exception ----
class JavaParserBaseException(Exception):
    """Common root for every error raised by the Java parser."""

    def __init__(self, message=''):
        super(JavaParserBaseException, self).__init__(message)


class JavaSyntaxError(JavaParserBaseException):
    """The source text violated the Java grammar.

    Attributes:
        description: human-readable explanation of the problem.
        at: the token where the error was detected (may be None).
    """

    def __init__(self, description, at=None):
        super(JavaSyntaxError, self).__init__()
        self.description = description
        self.at = at


class JavaParserError(JavaParserBaseException):
    """Internal parser misuse (e.g. accept() called with no values)."""
    pass
# ------------------------------------------------------------------------------
# ---- Parser class ----
class Parser(object):
# Binary operator precedence table, ordered loosest ('||') to tightest
# ('*', '/', '%'); the row index is the level used by
# build_binary_operation.
operator_precedence = [ set(('||',)),
                        set(('&&',)),
                        set(('|',)),
                        set(('^',)),
                        set(('&',)),
                        set(('==', '!=')),
                        set(('<', '>', '>=', '<=', 'instanceof')),
                        set(('<<', '>>', '>>>')),
                        set(('+', '-')),
                        set(('*', '/', '%')) ]

def __init__(self, tokens):
    """Create a parser over ``tokens`` with look-ahead support."""
    self.tokens = util.LookAheadListIterator(tokens)
    # Looking past the end yields EndOfInput instead of raising.
    self.tokens.set_default(EndOfInput(None))
    self.debug = False
# ------------------------------------------------------------------------------
# ---- Debug control ----
def set_debug(self, debug=True):
    """Toggle per-instance trace output (used by parse_debug wrappers)."""
    self.debug = debug

# ------------------------------------------------------------------------------
# ---- Parsing entry point ----

def parse(self):
    """Parse a whole compilation unit (the top-level entry point)."""
    return self.parse_compilation_unit()
# ------------------------------------------------------------------------------
# ---- Helper methods ----
def illegal(self, description, at=None):
    """Raise JavaSyntaxError at ``at`` (default: the current token)."""
    if not at:
        at = self.tokens.look()
    raise JavaSyntaxError(description, at)

def accept(self, *accepts):
    """Consume one token per expected value/type; return the last value.

    Each element of ``accepts`` is either a literal token value (string)
    or a token class; any mismatch raises JavaSyntaxError via illegal().
    """
    last = None
    if len(accepts) == 0:
        raise JavaParserError("Missing acceptable values")
    for accept in accepts:
        token = next(self.tokens)
        if isinstance(accept, six.string_types) and (
                not token.value == accept):
            self.illegal("Expected '%s'" % (accept,))
        elif isinstance(accept, type) and not isinstance(token, accept):
            self.illegal("Expected %s" % (accept.__name__,))
        last = token
    return last.value

def would_accept(self, *accepts):
    """Non-consuming look-ahead: do the next tokens match ``accepts``?"""
    if len(accepts) == 0:
        raise JavaParserError("Missing acceptable values")
    for i, accept in enumerate(accepts):
        token = self.tokens.look(i)
        if isinstance(accept, six.string_types) and (
                not token.value == accept):
            return False
        elif isinstance(accept, type) and not isinstance(token, accept):
            return False
    return True

def try_accept(self, *accepts):
    """Like would_accept, but consume the tokens when they all match."""
    if len(accepts) == 0:
        raise JavaParserError("Missing acceptable values")
    for i, accept in enumerate(accepts):
        token = self.tokens.look(i)
        if isinstance(accept, six.string_types) and (
                not token.value == accept):
            return False
        elif isinstance(accept, type) and not isinstance(token, accept):
            return False
    for i in range(0, len(accepts)):
        next(self.tokens)
    return True
def build_binary_operation(self, parts, start_level=0):
    """Fold a flat [operand, operator, operand, ...] list into a tree.

    Recursively splits ``parts`` at the lowest-precedence operators
    present (see operator_precedence), then chains the results into
    left-associative tree.BinaryOperation nodes.
    """
    if len(parts) == 1:
        return parts[0]
    operands = list()
    operators = list()
    i = 0
    for level in range(start_level, len(self.operator_precedence)):
        for j in range(1, len(parts) - 1, 2):
            # Odd indices hold operators; split on those at this level.
            if parts[j] in self.operator_precedence[level]:
                operand = self.build_binary_operation(parts[i:j], level + 1)
                operator = parts[j]
                i = j + 1
                operands.append(operand)
                operators.append(operator)
        if operands:
            # Found the loosest-binding level present; stop scanning.
            break
    operand = self.build_binary_operation(parts[i:], level + 1)
    operands.append(operand)
    operation = operands[0]
    # Chain the remaining operands left-associatively.
    for operator, operandr in zip(operators, operands[1:]):
        operation = tree.BinaryOperation(operandl=operation)
        operation.operator = operator
        operation.operandr = operandr
    return operation
def is_annotation(self, i=0):
    """ Returns true if the position is the start of an annotation application
    (as opposed to an annotation declaration)
    """
    return (isinstance(self.tokens.look(i), Annotation)
            and not self.tokens.look(i + 1).value == 'interface')

def is_annotation_declaration(self, i=0):
    """ Returns true if the position is the start of an annotation declaration
    (as opposed to an annotation application)
    """
    return (isinstance(self.tokens.look(i), Annotation)
            and self.tokens.look(i + 1).value == 'interface')
# ------------------------------------------------------------------------------
# ---- Parsing methods ----
# ------------------------------------------------------------------------------
# -- Identifiers --
@parse_debug
def parse_identifier(self):
    """Consume and return a single identifier token's value."""
    return self.accept(Identifier)

@parse_debug
def parse_qualified_identifier(self):
    """Parse dot-separated identifiers, returned joined as 'a.b.c'."""
    qualified_identifier = list()
    while True:
        identifier = self.parse_identifier()
        qualified_identifier.append(identifier)
        if not self.try_accept('.'):
            break
    return '.'.join(qualified_identifier)

@parse_debug
def parse_qualified_identifier_list(self):
    """Parse a comma-separated list of qualified identifiers."""
    qualified_identifiers = list()
    while True:
        qualified_identifier = self.parse_qualified_identifier()
        qualified_identifiers.append(qualified_identifier)
        if not self.try_accept(','):
            break
    return qualified_identifiers
# ------------------------------------------------------------------------------
# -- Top level units --
@parse_debug
def parse_compilation_unit(self):
    """Parse [package] [imports] [type declarations] into a CompilationUnit."""
    package = None
    package_annotations = None
    javadoc = None
    import_declarations = list()
    type_declarations = list()
    # Marker lets us rewind if the leading annotations turn out to belong
    # to the first type declaration rather than a package statement.
    self.tokens.push_marker()
    next_token = self.tokens.look()
    if next_token:
        javadoc = next_token.javadoc
    if self.is_annotation():
        package_annotations = self.parse_annotations()
    if self.try_accept('package'):
        # Commit: the annotations really were package annotations.
        self.tokens.pop_marker(False)
        package_name = self.parse_qualified_identifier()
        package = tree.PackageDeclaration(annotations=package_annotations,
                                          name=package_name,
                                          documentation=javadoc)
        self.accept(';')
    else:
        # No package: rewind so the annotations re-parse with the type.
        self.tokens.pop_marker(True)
        package_annotations = None
    while self.would_accept('import'):
        import_declaration = self.parse_import_declaration()
        import_declarations.append(import_declaration)
    while not isinstance(self.tokens.look(), EndOfInput):
        try:
            type_declaration = self.parse_type_declaration()
        except StopIteration:
            self.illegal("Unexpected end of input")
        if type_declaration:
            type_declarations.append(type_declaration)
    return tree.CompilationUnit(package=package,
                                imports=import_declarations,
                                types=type_declarations)
@parse_debug
def parse_import_declaration(self):
    """Parse 'import [static] a.b.C[.*];' into a tree.Import."""
    qualified_identifier = list()
    static = False
    import_all = False
    self.accept('import')
    if self.try_accept('static'):
        static = True
    while True:
        identifier = self.parse_identifier()
        qualified_identifier.append(identifier)
        if self.try_accept('.'):
            # A '*' right after a dot ends a wildcard import.
            if self.try_accept('*'):
                self.accept(';')
                import_all = True
                break
        else:
            self.accept(';')
            break
    return tree.Import(path='.'.join(qualified_identifier),
                       static=static,
                       wildcard=import_all)

@parse_debug
def parse_type_declaration(self):
    """Parse one top-level type declaration; a bare ';' yields None."""
    if self.try_accept(';'):
        return None
    else:
        return self.parse_class_or_interface_declaration()
@parse_debug
def parse_class_or_interface_declaration(self):
    """Dispatch on the next keyword to the right type-declaration parser."""
    modifiers, annotations, javadoc = self.parse_modifiers()
    type_declaration = None
    token = self.tokens.look()
    if token.value == 'class':
        type_declaration = self.parse_normal_class_declaration()
    elif token.value == 'enum':
        type_declaration = self.parse_enum_declaration()
    elif token.value == 'interface':
        type_declaration = self.parse_normal_interface_declaration()
    elif self.is_annotation_declaration():
        type_declaration = self.parse_annotation_type_declaration()
    else:
        self.illegal("Expected type declaration")
    # Modifiers/annotations/javadoc were parsed up front; attach them now.
    type_declaration.modifiers = modifiers
    type_declaration.annotations = annotations
    type_declaration.documentation = javadoc
    return type_declaration
@parse_debug
def parse_normal_class_declaration(self):
    """Parse 'class Name [<T>] [extends X] [implements Y,...] {body}'."""
    name = None
    type_params = None
    extends = None
    implements = None
    body = None
    self.accept('class')
    name = self.parse_identifier()
    if self.would_accept('<'):
        type_params = self.parse_type_parameters()
    if self.try_accept('extends'):
        extends = self.parse_type()
    if self.try_accept('implements'):
        implements = self.parse_type_list()
    body = self.parse_class_body()
    return tree.ClassDeclaration(name=name,
                                 type_parameters=type_params,
                                 extends=extends,
                                 implements=implements,
                                 body=body)

@parse_debug
def parse_enum_declaration(self):
    """Parse 'enum Name [implements X,...] {body}'."""
    name = None
    implements = None
    body = None
    self.accept('enum')
    name = self.parse_identifier()
    if self.try_accept('implements'):
        implements = self.parse_type_list()
    body = self.parse_enum_body()
    return tree.EnumDeclaration(name=name,
                                implements=implements,
                                body=body)

@parse_debug
def parse_normal_interface_declaration(self):
    """Parse 'interface Name [<T>] [extends X,...] {body}'."""
    name = None
    type_parameters = None
    extends = None
    body = None
    self.accept('interface')
    name = self.parse_identifier()
    if self.would_accept('<'):
        type_parameters = self.parse_type_parameters()
    if self.try_accept('extends'):
        extends = self.parse_type_list()
    body = self.parse_interface_body()
    return tree.InterfaceDeclaration(name=name,
                                     type_parameters=type_parameters,
                                     extends=extends,
                                     body=body)

@parse_debug
def parse_annotation_type_declaration(self):
    """Parse '@interface Name {body}'."""
    name = None
    body = None
    self.accept('@', 'interface')
    name = self.parse_identifier()
    body = self.parse_annotation_type_body()
    return tree.AnnotationDeclaration(name=name,
                                      body=body)
# ------------------------------------------------------------------------------
# -- Types --
@parse_debug
def parse_type(self):
    """Parse a basic or reference type plus trailing '[]' dimensions."""
    java_type = None
    if isinstance(self.tokens.look(), BasicType):
        java_type = self.parse_basic_type()
    elif isinstance(self.tokens.look(), Identifier):
        java_type = self.parse_reference_type()
    else:
        self.illegal("Expected type")
    java_type.dimensions = self.parse_array_dimension()
    return java_type

@parse_debug
def parse_basic_type(self):
    """Parse a primitive type keyword (int, boolean, ...)."""
    return tree.BasicType(name=self.accept(BasicType))

@parse_debug
def parse_reference_type(self):
    """Parse 'Outer<...>.Inner<...>' chains into nested ReferenceTypes."""
    reference_type = tree.ReferenceType()
    tail = reference_type
    while True:
        tail.name = self.parse_identifier()
        if self.would_accept('<'):
            tail.arguments = self.parse_type_arguments()
        if self.try_accept('.'):
            # Qualified type: hang the next segment off sub_type.
            tail.sub_type = tree.ReferenceType()
            tail = tail.sub_type
        else:
            break
    return reference_type
@parse_debug
def parse_type_arguments(self):
    """Parse '<arg, arg, ...>' into a list of TypeArguments."""
    type_arguments = list()
    self.accept('<')
    while True:
        type_argument = self.parse_type_argument()
        type_arguments.append(type_argument)
        if self.try_accept('>'):
            break
        self.accept(',')
    return type_arguments

@parse_debug
def parse_type_argument(self):
    """Parse one type argument: '?', '? extends X', '? super X', or a type."""
    pattern_type = None
    base_type = None
    if self.try_accept('?'):
        if self.tokens.look().value in ('extends', 'super'):
            pattern_type = self.tokens.next().value
        else:
            # Bare wildcard with no bound.
            return tree.TypeArgument(pattern_type='?')
    if self.would_accept(BasicType):
        # Primitives appear here only as array types, e.g. 'int[]'.
        base_type = self.parse_basic_type()
        self.accept('[', ']')
        base_type.dimensions = [None]
    else:
        base_type = self.parse_reference_type()
        base_type.dimensions = []
    base_type.dimensions += self.parse_array_dimension()
    return tree.TypeArgument(type=base_type,
                             pattern_type=pattern_type)
@parse_debug
def parse_nonwildcard_type_arguments(self):
    """Parse '<T1, T2, ...>' where wildcards are not allowed."""
    self.accept('<')
    type_arguments = self.parse_type_list()
    self.accept('>')
    return [tree.TypeArgument(type=t) for t in type_arguments]

@parse_debug
def parse_type_list(self):
    """Parse a comma-separated type list (primitives must be arrays)."""
    types = list()
    while True:
        if self.would_accept(BasicType):
            base_type = self.parse_basic_type()
            self.accept('[', ']')
            base_type.dimensions = [None]
        else:
            base_type = self.parse_reference_type()
            base_type.dimensions = []
        base_type.dimensions += self.parse_array_dimension()
        types.append(base_type)
        if not self.try_accept(','):
            break
    return types

@parse_debug
def parse_type_arguments_or_diamond(self):
    """Parse '<>' (empty list) or full type arguments."""
    if self.try_accept('<', '>'):
        return list()
    else:
        return self.parse_type_arguments()

@parse_debug
def parse_nonwildcard_type_arguments_or_diamond(self):
    """Parse '<>' (empty list) or non-wildcard type arguments."""
    if self.try_accept('<', '>'):
        return list()
    else:
        return self.parse_nonwildcard_type_arguments()
@parse_debug
def parse_type_parameters(self):
    """Parse declaration-site '<T extends A & B, U, ...>'."""
    type_parameters = list()
    self.accept('<')
    while True:
        type_parameter = self.parse_type_parameter()
        type_parameters.append(type_parameter)
        if self.try_accept('>'):
            break
        else:
            self.accept(',')
    return type_parameters

@parse_debug
def parse_type_parameter(self):
    """Parse one type parameter with an optional '&'-joined bound list."""
    identifier = self.parse_identifier()
    extends = None
    if self.try_accept('extends'):
        extends = list()
        while True:
            reference_type = self.parse_reference_type()
            extends.append(reference_type)
            if not self.try_accept('&'):
                break
    return tree.TypeParameter(name=identifier,
                              extends=extends)

@parse_debug
def parse_array_dimension(self):
    """Count trailing '[]' pairs; returns one None per dimension."""
    array_dimension = 0
    while self.try_accept('[', ']'):
        array_dimension += 1
    return [None] * array_dimension
# ------------------------------------------------------------------------------
# -- Annotations and modifiers --
@parse_debug
def parse_modifiers(self):
    """Collect modifiers and annotations; also grab the leading javadoc."""
    annotations = list()
    modifiers = set()
    javadoc = None
    next_token = self.tokens.look()
    if next_token:
        javadoc = next_token.javadoc
    while True:
        if self.would_accept(Modifier):
            modifiers.add(self.accept(Modifier))
        elif self.is_annotation():
            annotation = self.parse_annotation()
            annotations.append(annotation)
        else:
            break
    return (modifiers, annotations, javadoc)

@parse_debug
def parse_annotations(self):
    """Parse one or more consecutive annotation applications."""
    annotations = list()
    while True:
        annotation = self.parse_annotation()
        annotations.append(annotation)
        if not self.is_annotation():
            break
    return annotations

@parse_debug
def parse_annotation(self):
    """Parse '@Name' with an optional '(element)' argument list."""
    qualified_identifier = None
    annotation_element = None
    self.accept('@')
    qualified_identifier = self.parse_qualified_identifier()
    if self.try_accept('('):
        if not self.would_accept(')'):
            annotation_element = self.parse_annotation_element()
        self.accept(')')
    return tree.Annotation(name=qualified_identifier,
                           element=annotation_element)
@parse_debug
def parse_annotation_element(self):
    """Parse 'name = value' pairs or a single element value."""
    if self.would_accept(Identifier, '='):
        return self.parse_element_value_pairs()
    else:
        return self.parse_element_value()

@parse_debug
def parse_element_value_pairs(self):
    """Parse comma-separated 'name = value' annotation arguments."""
    pairs = list()
    while True:
        pair = self.parse_element_value_pair()
        pairs.append(pair)
        if not self.try_accept(','):
            break
    return pairs

@parse_debug
def parse_element_value_pair(self):
    """Parse one 'name = value' annotation argument."""
    identifier = self.parse_identifier()
    self.accept('=')
    value = self.parse_element_value()
    return tree.ElementValuePair(name=identifier,
                                 value=value)

@parse_debug
def parse_element_value(self):
    """Parse a value: nested annotation, array initializer, or expression."""
    if self.is_annotation():
        return self.parse_annotation()
    elif self.would_accept('{'):
        return self.parse_element_value_array_initializer()
    else:
        return self.parse_expressionl()

@parse_debug
def parse_element_value_array_initializer(self):
    """Parse '{v1, v2, ...}' (trailing comma allowed) as an array value."""
    self.accept('{')
    if self.try_accept('}'):
        return list()
    element_values = self.parse_element_values()
    self.try_accept(',')
    self.accept('}')
    return tree.ElementArrayValue(values=element_values)

@parse_debug
def parse_element_values(self):
    """Parse the comma-separated values inside an array initializer."""
    element_values = list()
    while True:
        element_value = self.parse_element_value()
        element_values.append(element_value)
        # Stop before the closing brace (with or without trailing comma).
        if self.would_accept('}') or self.would_accept(',', '}'):
            break
        self.accept(',')
    return element_values
# ------------------------------------------------------------------------------
# -- Class body --
@parse_debug
def parse_class_body(self):
    """Parse '{ member* }' of a class, skipping empty declarations."""
    declarations = list()
    self.accept('{')
    while not self.would_accept('}'):
        declaration = self.parse_class_body_declaration()
        if declaration:
            declarations.append(declaration)
    self.accept('}')
    return declarations

@parse_debug
def parse_class_body_declaration(self):
    """Parse one class-body item: ';', (static) initializer, or member."""
    token = self.tokens.look()
    if self.try_accept(';'):
        return None
    elif self.would_accept('static', '{'):
        # Static initializer block.
        self.accept('static')
        return self.parse_block()
    elif self.would_accept('{'):
        # Instance initializer block.
        return self.parse_block()
    else:
        return self.parse_member_declaration()
@parse_debug
def parse_member_declaration(self):
    """Parse any class member, dispatching on the leading tokens."""
    modifiers, annotations, javadoc = self.parse_modifiers()
    member = None
    token = self.tokens.look()
    if self.try_accept('void'):
        method_name = self.parse_identifier()
        member = self.parse_void_method_declarator_rest()
        member.name = method_name
    elif token.value == '<':
        member = self.parse_generic_method_or_constructor_declaration()
    elif token.value == 'class':
        member = self.parse_normal_class_declaration()
    elif token.value == 'enum':
        member = self.parse_enum_declaration()
    elif token.value == 'interface':
        member = self.parse_normal_interface_declaration()
    elif self.is_annotation_declaration():
        member = self.parse_annotation_type_declaration()
    elif self.would_accept(Identifier, '('):
        # Identifier immediately followed by '(' means a constructor.
        constructor_name = self.parse_identifier()
        member = self.parse_constructor_declarator_rest()
        member.name = constructor_name
    else:
        member = self.parse_method_or_field_declaraction()
    member._position = token.position
    member.modifiers = modifiers
    member.annotations = annotations
    member.documentation = javadoc
    return member
@parse_debug
def parse_method_or_field_declaraction(self):
    """Parse 'Type name ...' as either a method or a field declaration.

    NOTE(review): the name carries a historical 'declaraction' typo;
    renaming would break the call in parse_member_declaration.
    """
    member_type = self.parse_type()
    member_name = self.parse_identifier()
    member = self.parse_method_or_field_rest()
    if isinstance(member, tree.MethodDeclaration):
        # Array dims may be split between the type and the declarator.
        member_type.dimensions += member.return_type.dimensions
        member.name = member_name
        member.return_type = member_type
    else:
        member.type = member_type
        member.declarators[0].name = member_name
    return member

@parse_debug
def parse_method_or_field_rest(self):
    """'(' starts a method; otherwise parse field declarators then ';'."""
    if self.would_accept('('):
        return self.parse_method_declarator_rest()
    else:
        rest = self.parse_field_declarators_rest()
        self.accept(';')
        return rest

@parse_debug
def parse_field_declarators_rest(self):
    """Parse 'a = 1, b[] = ...' after the first field's name."""
    array_dimension, initializer = self.parse_variable_declarator_rest()
    declarators = [tree.VariableDeclarator(dimensions=array_dimension,
                                           initializer=initializer)]
    while self.try_accept(','):
        declarator = self.parse_variable_declarator()
        declarators.append(declarator)
    return tree.FieldDeclaration(declarators=declarators)
@parse_debug
def parse_method_declarator_rest(self):
    """Parse params, extra '[]', throws, and body of a typed method."""
    formal_parameters = self.parse_formal_parameters()
    additional_dimensions = self.parse_array_dimension()
    throws = None
    body = None
    if self.try_accept('throws'):
        throws = self.parse_qualified_identifier_list()
    if self.would_accept('{'):
        body = self.parse_block()
    else:
        self.accept(';')
    # Caller merges these dimensions into the declared return type.
    return tree.MethodDeclaration(parameters=formal_parameters,
                                  throws=throws,
                                  body=body,
                                  return_type=tree.Type(dimensions=additional_dimensions))

@parse_debug
def parse_void_method_declarator_rest(self):
    """Parse params, throws, and body of a void method (no return type)."""
    formal_parameters = self.parse_formal_parameters()
    throws = None
    body = None
    if self.try_accept('throws'):
        throws = self.parse_qualified_identifier_list()
    if self.would_accept('{'):
        body = self.parse_block()
    else:
        self.accept(';')
    return tree.MethodDeclaration(parameters=formal_parameters,
                                  throws=throws,
                                  body=body)

@parse_debug
def parse_constructor_declarator_rest(self):
    """Parse params, throws, and (mandatory) body of a constructor."""
    formal_parameters = self.parse_formal_parameters()
    throws = None
    body = None
    if self.try_accept('throws'):
        throws = self.parse_qualified_identifier_list()
    body = self.parse_block()
    return tree.ConstructorDeclaration(parameters=formal_parameters,
                                       throws=throws,
                                       body=body)
@parse_debug
def parse_generic_method_or_constructor_declaration(self):
    """Parse a member that starts with '<T, ...>' type parameters."""
    type_parameters = self.parse_type_parameters()
    method = None
    if self.would_accept(Identifier, '('):
        constructor_name = self.parse_identifier()
        method = self.parse_constructor_declarator_rest()
        method.name = constructor_name
    elif self.try_accept('void'):
        method_name = self.parse_identifier()
        method = self.parse_void_method_declarator_rest()
        method.name = method_name
    else:
        method_return_type = self.parse_type()
        method_name = self.parse_identifier()
        method = self.parse_method_declarator_rest()
        # Merge dims found after the parameter list into the return type.
        method_return_type.dimensions += method.return_type.dimensions
        method.return_type = method_return_type
        method.name = method_name
    method.type_parameters = type_parameters
    return method
# ------------------------------------------------------------------------------
# -- Interface body --
@parse_debug
def parse_interface_body(self):
    """Parse '{ member* }' of an interface."""
    declarations = list()
    self.accept('{')
    while not self.would_accept('}'):
        declaration = self.parse_interface_body_declaration()
        if declaration:
            declarations.append(declaration)
    self.accept('}')
    return declarations

@parse_debug
def parse_interface_body_declaration(self):
    """Parse one interface member; a bare ';' yields None."""
    if self.try_accept(';'):
        return None
    modifiers, annotations, javadoc = self.parse_modifiers()
    declaration = self.parse_interface_member_declaration()
    declaration.modifiers = modifiers
    declaration.annotations = annotations
    declaration.documentation = javadoc
    return declaration

@parse_debug
def parse_interface_member_declaration(self):
    """Dispatch an interface member by its leading token(s)."""
    declaration = None
    if self.would_accept('class'):
        declaration = self.parse_normal_class_declaration()
    elif self.would_accept('interface'):
        declaration = self.parse_normal_interface_declaration()
    elif self.would_accept('enum'):
        declaration = self.parse_enum_declaration()
    elif self.is_annotation_declaration():
        declaration = self.parse_annotation_type_declaration()
    elif self.would_accept('<'):
        declaration = self.parse_interface_generic_method_declarator()
    elif self.try_accept('void'):
        method_name = self.parse_identifier()
        declaration = self.parse_void_interface_method_declarator_rest()
        declaration.name = method_name
    else:
        declaration = self.parse_interface_method_or_field_declaration()
    return declaration
@parse_debug
def parse_interface_method_or_field_declaration(self):
    """Parse 'Type name ...' as an interface method or constant field."""
    java_type = self.parse_type()
    name = self.parse_identifier()
    member = self.parse_interface_method_or_field_rest()
    if isinstance(member, tree.MethodDeclaration):
        # Array dims may be split between the type and the declarator.
        java_type.dimensions += member.return_type.dimensions
        member.name = name
        member.return_type = java_type
    else:
        member.declarators[0].name = name
        member.type = java_type
    return member

@parse_debug
def parse_interface_method_or_field_rest(self):
    """'(' starts a method; otherwise constants followed by ';'."""
    rest = None
    if self.would_accept('('):
        rest = self.parse_interface_method_declarator_rest()
    else:
        rest = self.parse_constant_declarators_rest()
        self.accept(';')
    return rest

@parse_debug
def parse_constant_declarators_rest(self):
    """Parse the '= init, name = init, ...' tail of interface constants."""
    array_dimension, initializer = self.parse_constant_declarator_rest()
    declarators = [tree.VariableDeclarator(dimensions=array_dimension,
                                           initializer=initializer)]
    while self.try_accept(','):
        declarator = self.parse_constant_declarator()
        declarators.append(declarator)
    return tree.ConstantDeclaration(declarators=declarators)

@parse_debug
def parse_constant_declarator_rest(self):
    """Parse '[]* = initializer' for one constant (init is mandatory)."""
    array_dimension = self.parse_array_dimension()
    self.accept('=')
    initializer = self.parse_variable_initializer()
    return (array_dimension, initializer)

@parse_debug
def parse_constant_declarator(self):
    """Parse one additional 'name [...] = init' constant declarator."""
    name = self.parse_identifier()
    additional_dimension, initializer = self.parse_constant_declarator_rest()
    return tree.VariableDeclarator(name=name,
                                   dimensions=additional_dimension,
                                   initializer=initializer)
@parse_debug
def parse_interface_method_declarator_rest(self):
    """Parse params, '[]', throws, and optional body of a typed method."""
    parameters = self.parse_formal_parameters()
    array_dimension = self.parse_array_dimension()
    throws = None
    body = None
    if self.try_accept('throws'):
        throws = self.parse_qualified_identifier_list()
    if self.would_accept('{'):
        body = self.parse_block()
    else:
        self.accept(';')
    return tree.MethodDeclaration(parameters=parameters,
                                  throws=throws,
                                  body=body,
                                  return_type=tree.Type(dimensions=array_dimension))

@parse_debug
def parse_void_interface_method_declarator_rest(self):
    """Parse params, throws, and optional body of a void interface method."""
    parameters = self.parse_formal_parameters()
    throws = None
    body = None
    if self.try_accept('throws'):
        throws = self.parse_qualified_identifier_list()
    if self.would_accept('{'):
        body = self.parse_block()
    else:
        self.accept(';')
    return tree.MethodDeclaration(parameters=parameters,
                                  throws=throws,
                                  body=body)

@parse_debug
def parse_interface_generic_method_declarator(self):
    """Parse an interface method that declares its own type parameters."""
    type_parameters = self.parse_type_parameters()
    return_type = None
    method_name = None
    if not self.try_accept('void'):
        return_type = self.parse_type()
    method_name = self.parse_identifier()
    method = self.parse_interface_method_declarator_rest()
    method.name = method_name
    method.return_type = return_type
    method.type_parameters = type_parameters
    return method
# ------------------------------------------------------------------------------
# -- Parameters and variables --
@parse_debug
def parse_formal_parameters(self):
    """Parse a parenthesized, comma-separated formal parameter list."""
    params = []
    self.accept('(')
    if self.try_accept(')'):
        return params
    while True:
        modifiers, annotations = self.parse_variable_modifiers()
        param_type = self.parse_type()
        is_varargs = False
        if self.try_accept('...'):
            is_varargs = True
        param_name = self.parse_identifier()
        # Trailing '[]' pairs add to the declared type's dimensions.
        param_type.dimensions += self.parse_array_dimension()
        params.append(tree.FormalParameter(modifiers=modifiers,
                                           annotations=annotations,
                                           type=param_type,
                                           name=param_name,
                                           varargs=is_varargs))
        if is_varargs:
            # A varargs parameter must come last.
            break
        if not self.try_accept(','):
            break
    self.accept(')')
    return params
@parse_debug
def parse_variable_modifiers(self):
    """Collect 'final' modifiers and annotations preceding a variable.

    Returns a (modifiers-set, annotations-list) pair.
    """
    modifiers = set()
    annotations = []
    while True:
        if self.try_accept('final'):
            modifiers.add('final')
            continue
        if not self.is_annotation():
            break
        annotations.append(self.parse_annotation())
    return modifiers, annotations
@parse_debug
def parse_variable_declators(self):
    """Parse a comma-separated list of variable declarators.

    NOTE(review): this looks like a misspelled duplicate of
    parse_variable_declarators below; it calls parse_variable_declator,
    which is not defined in this chunk — confirm whether this method is
    dead code before relying on it.
    """
    declarators = list()
    while True:
        declarator = self.parse_variable_declator()
        declarators.append(declarator)
        if not self.try_accept(','):
            break
    return declarators
@parse_debug
def parse_variable_declarators(self):
    """Parse one or more comma-separated variable declarators."""
    result = [self.parse_variable_declarator()]
    while self.try_accept(','):
        result.append(self.parse_variable_declarator())
    return result
@parse_debug
def parse_variable_declarator(self):
    """Parse 'name [dims] [= initializer]' into a VariableDeclarator."""
    var_name = self.parse_identifier()
    dims, init = self.parse_variable_declarator_rest()
    return tree.VariableDeclarator(name=var_name,
                                   dimensions=dims,
                                   initializer=init)
@parse_debug
def parse_variable_declarator_rest(self):
    """Parse optional array dimensions and an optional '=' initializer."""
    dims = self.parse_array_dimension()
    init = self.parse_variable_initializer() if self.try_accept('=') else None
    return (dims, init)
@parse_debug
def parse_variable_initializer(self):
    """Parse either a '{...}' array initializer or a plain expression."""
    if self.would_accept('{'):
        return self.parse_array_initializer()
    return self.parse_expression()
@parse_debug
def parse_array_initializer(self):
    """Parse '{ ... }' array initializer, tolerating a lone or a
    trailing comma."""
    result = tree.ArrayInitializer(initializers=[])
    self.accept('{')
    if self.try_accept(','):
        # '{ , }' — empty initializer with a stray comma.
        self.accept('}')
        return result
    if self.try_accept('}'):
        return result
    while True:
        result.initializers.append(self.parse_variable_initializer())
        if not self.would_accept('}'):
            self.accept(',')
        if self.try_accept('}'):
            return result
# ------------------------------------------------------------------------------
# -- Blocks and statements --
@parse_debug
def parse_block(self):
    """Parse '{ statements }' and return the list of parsed statements."""
    self.accept('{')
    statements = []
    while not self.would_accept('}'):
        statements.append(self.parse_block_statement())
    self.accept('}')
    return statements
@parse_debug
def parse_block_statement(self):
    """Parse one block statement: a local class/interface declaration,
    a local variable declaration, or an ordinary statement.

    Uses unbounded lookahead over annotations and modifiers to decide
    which of the three forms follows; falls back to speculative parsing
    (token-stream rollback on JavaSyntaxError) when ambiguous.
    """
    if self.would_accept(Identifier, ':'):
        # Labeled statement
        return self.parse_statement()
    if self.would_accept('synchronized'):
        return self.parse_statement()
    token = None
    found_annotations = False
    i = 0
    # Look past annotations and modifiers. If we find a modifier that is not
    # 'final' then the statement must be a class or interface declaration
    while True:
        token = self.tokens.look(i)
        if isinstance(token, Modifier):
            if not token.value == 'final':
                return self.parse_class_or_interface_declaration()
        elif self.is_annotation(i):
            found_annotations = True
            # Skip '@Name' and any '.Sub' qualifiers.
            i += 2
            while self.tokens.look(i).value == '.':
                i += 2
            if self.tokens.look(i).value == '(':
                # Skip a parenthesized (possibly nested) element list.
                parens = 1
                i += 1
                while parens > 0:
                    token = self.tokens.look(i)
                    if token.value == '(':
                        parens += 1
                    elif token.value == ')':
                        parens -= 1
                    i += 1
                continue
        else:
            break
        i += 1
    if token.value in ('class', 'enum', 'interface', '@'):
        return self.parse_class_or_interface_declaration()
    if found_annotations or isinstance(token, BasicType):
        return self.parse_local_variable_declaration_statement()
    # At this point, if the block statement is a variable definition the next
    # token MUST be an identifier, so if it isn't we can conclude the block
    # statement is a normal statement
    if not isinstance(token, Identifier):
        return self.parse_statement()
    # We can't easily determine the statement type. Try parsing as a variable
    # declaration first and fall back to a statement
    try:
        with self.tokens:
            return self.parse_local_variable_declaration_statement()
    except JavaSyntaxError:
        return self.parse_statement()
@parse_debug
def parse_local_variable_declaration_statement(self):
    """Parse 'modifiers type declarators ;' as a local declaration."""
    modifiers, annotations = self.parse_variable_modifiers()
    declared_type = self.parse_type()
    declarator_list = self.parse_variable_declarators()
    self.accept(';')
    return tree.LocalVariableDeclaration(modifiers=modifiers,
                                         annotations=annotations,
                                         type=declared_type,
                                         declarators=declarator_list)
@parse_debug
def parse_statement(self):
    """Parse a single Java statement and return its tree node.

    Dispatches on the leading keyword/token; anything that does not
    start a known statement form is parsed as an expression statement.

    Fixes: 'is None' instead of '== None' in the try-statement check;
    removed an unused 'token = self.tokens.look()' peek.
    """
    if self.would_accept('{'):
        block = self.parse_block()
        return tree.BlockStatement(statements=block)

    elif self.try_accept(';'):
        # Empty statement.
        return tree.Statement()

    elif self.would_accept(Identifier, ':'):
        # Labeled statement: the label is attached to the inner statement.
        label = self.parse_identifier()
        self.accept(':')
        statement = self.parse_statement()
        statement.label = label
        return statement

    elif self.try_accept('if'):
        condition = self.parse_par_expression()
        then = self.parse_statement()
        else_statement = None
        if self.try_accept('else'):
            else_statement = self.parse_statement()
        return tree.IfStatement(condition=condition,
                                then_statement=then,
                                else_statement=else_statement)

    elif self.try_accept('assert'):
        condition = self.parse_expression()
        value = None
        if self.try_accept(':'):
            value = self.parse_expression()
        self.accept(';')
        return tree.AssertStatement(condition=condition,
                                    value=value)

    elif self.try_accept('switch'):
        switch_expression = self.parse_par_expression()
        self.accept('{')
        switch_block = self.parse_switch_block_statement_groups()
        self.accept('}')
        return tree.SwitchStatement(expression=switch_expression,
                                    cases=switch_block)

    elif self.try_accept('while'):
        condition = self.parse_par_expression()
        action = self.parse_statement()
        return tree.WhileStatement(condition=condition,
                                   body=action)

    elif self.try_accept('do'):
        action = self.parse_statement()
        self.accept('while')
        condition = self.parse_par_expression()
        self.accept(';')
        return tree.DoStatement(condition=condition,
                                body=action)

    elif self.try_accept('for'):
        self.accept('(')
        for_control = self.parse_for_control()
        self.accept(')')
        for_statement = self.parse_statement()
        return tree.ForStatement(control=for_control,
                                 body=for_statement)

    elif self.try_accept('break'):
        label = None
        if self.would_accept(Identifier):
            label = self.parse_identifier()
        self.accept(';')
        return tree.BreakStatement(goto=label)

    elif self.try_accept('continue'):
        label = None
        if self.would_accept(Identifier):
            label = self.parse_identifier()
        self.accept(';')
        return tree.ContinueStatement(goto=label)

    elif self.try_accept('return'):
        value = None
        if not self.would_accept(';'):
            value = self.parse_expression()
        self.accept(';')
        return tree.ReturnStatement(expression=value)

    elif self.try_accept('throw'):
        value = self.parse_expression()
        self.accept(';')
        return tree.ThrowStatement(expression=value)

    elif self.try_accept('synchronized'):
        lock = self.parse_par_expression()
        block = self.parse_block()
        return tree.SynchronizedStatement(lock=lock,
                                          block=block)

    elif self.try_accept('try'):
        resource_specification = None
        block = None
        catches = None
        finally_block = None

        if self.would_accept('{'):
            # Classic try: a body that must be followed by catch
            # and/or finally.
            block = self.parse_block()
            if self.would_accept('catch'):
                catches = self.parse_catches()
            if self.try_accept('finally'):
                finally_block = self.parse_block()
            if catches is None and finally_block is None:
                self.illegal("Expected catch/finally block")
        else:
            # try-with-resources: catch/finally are optional here.
            resource_specification = self.parse_resource_specification()
            block = self.parse_block()
            if self.would_accept('catch'):
                catches = self.parse_catches()
            if self.try_accept('finally'):
                finally_block = self.parse_block()

        return tree.TryStatement(resources=resource_specification,
                                 block=block,
                                 catches=catches,
                                 finally_block=finally_block)

    else:
        expression = self.parse_expression()
        self.accept(';')
        return tree.StatementExpression(expression=expression)
# ------------------------------------------------------------------------------
# -- Try / catch --
@parse_debug
def parse_catches(self):
    """Parse one or more consecutive catch clauses."""
    clauses = [self.parse_catch_clause()]
    while self.would_accept('catch'):
        clauses.append(self.parse_catch_clause())
    return clauses
@parse_debug
def parse_catch_clause(self):
    """Parse 'catch (mods Type1|Type2 name) { ... }'."""
    self.accept('catch', '(')
    # NOTE(review): the parsed modifiers/annotations are not attached
    # to the parameter node — matches the original behavior.
    modifiers, annotations = self.parse_variable_modifiers()
    parameter = tree.CatchClauseParameter(types=[])
    while True:
        parameter.types.append(self.parse_qualified_identifier())
        if not self.try_accept('|'):
            break
    parameter.name = self.parse_identifier()
    self.accept(')')
    handler = self.parse_block()
    return tree.CatchClause(parameter=parameter,
                            block=handler)
@parse_debug
def parse_resource_specification(self):
    """Parse '(resource; resource; ...)' for try-with-resources."""
    resources = []
    self.accept('(')
    while True:
        resources.append(self.parse_resource())
        if not self.would_accept(')'):
            # Resources are ';'-separated; a trailing ';' is tolerated.
            self.accept(';')
        if self.try_accept(')'):
            break
    return resources
@parse_debug
def parse_resource(self):
    """Parse one try-with-resources entry: 'mods Type name = expr'."""
    modifiers, annotations = self.parse_variable_modifiers()
    resource_type = self.parse_reference_type()
    resource_type.dimensions = self.parse_array_dimension()
    resource_name = self.parse_identifier()
    # Dimensions may also follow the name (C-style array syntax).
    resource_type.dimensions += self.parse_array_dimension()
    self.accept('=')
    initial_value = self.parse_expression()
    return tree.TryResource(modifiers=modifiers,
                            annotations=annotations,
                            type=resource_type,
                            name=resource_name,
                            value=initial_value)
# ------------------------------------------------------------------------------
# -- Switch and for statements ---
@parse_debug
def parse_switch_block_statement_groups(self):
    """Parse all 'case'/'default' groups inside a switch body."""
    groups = []
    while self.tokens.look().value in ('case', 'default'):
        groups.append(self.parse_switch_block_statement_group())
    return groups
@parse_debug
def parse_switch_block_statement_group(self):
    """Parse one run of consecutive 'case'/'default' labels plus the
    statements that follow them, up to the next label or '}'.
    """
    labels = list()
    statements = list()
    # Collect consecutive labels; 'default' contributes no value.
    while True:
        case_type = self.tokens.next().value
        case_value = None
        if case_type == 'case':
            if self.would_accept(Identifier, ':'):
                # Bare identifier label (e.g. an enum constant).
                case_value = self.parse_identifier()
            else:
                case_value = self.parse_expression()
            labels.append(case_value)
        elif not case_type == 'default':
            self.illegal("Expected switch case")
        self.accept(':')
        if self.tokens.look().value not in ('case', 'default'):
            break
    # Everything up to the next label or the end of the switch body
    # belongs to this group.
    while self.tokens.look().value not in ('case', 'default', '}'):
        statement = self.parse_block_statement()
        statements.append(statement)
    return tree.SwitchStatementCase(case=labels,
                                    statements=statements)
@parse_debug
def parse_for_control(self):
    """Parse a for-loop header, preferring the declaration/enhanced
    form and falling back to 'init; condition; update'."""
    try:
        with self.tokens:
            return self.parse_for_var_control()
    except JavaSyntaxError:
        pass

    initializer = None
    if not self.would_accept(';'):
        initializer = self.parse_for_init_or_update()
    self.accept(';')

    loop_condition = None
    if not self.would_accept(';'):
        loop_condition = self.parse_expression()
    self.accept(';')

    updates = None
    if not self.would_accept(')'):
        updates = self.parse_for_init_or_update()

    return tree.ForControl(init=initializer,
                           condition=loop_condition,
                           update=updates)
@parse_debug
def parse_for_var_control(self):
    """Parse a for header starting with a variable declaration; yields
    an EnhancedForControl (':' form) or a classic ForControl."""
    modifiers, annotations = self.parse_variable_modifiers()
    var_type = self.parse_type()
    first_name = self.parse_identifier()
    var_type.dimensions += self.parse_array_dimension()
    declaration = tree.VariableDeclaration(modifiers=modifiers,
                                           annotations=annotations,
                                           type=var_type)
    rest = self.parse_for_var_control_rest()
    if isinstance(rest, tree.Expression):
        # ':' form — rest is the iterable of an enhanced for.
        declaration.declarators = [tree.VariableDeclarator(name=first_name)]
        return tree.EnhancedForControl(var=declaration,
                                       iterable=rest)
    declarators, condition, update = rest
    declarators[0].name = first_name
    declaration.declarators = declarators
    return tree.ForControl(init=declaration,
                           condition=condition,
                           update=update)
@parse_debug
def parse_for_var_control_rest(self):
    """After 'type name' in a for header: either ': iterable' or the
    remaining declarators, condition and update parts."""
    if self.try_accept(':'):
        return self.parse_expression()

    if not self.would_accept(';'):
        declarators = self.parse_for_variable_declarator_rest()
    else:
        declarators = [tree.VariableDeclarator()]
    self.accept(';')

    condition = None
    if not self.would_accept(';'):
        condition = self.parse_expression()
    self.accept(';')

    update = None
    if not self.would_accept(')'):
        update = self.parse_for_init_or_update()

    return (declarators, condition, update)
@parse_debug
def parse_for_variable_declarator_rest(self):
    """Parse the first declarator's optional initializer plus any
    further comma-separated declarators."""
    first_initializer = None
    if self.try_accept('='):
        first_initializer = self.parse_variable_initializer()
    declarators = [tree.VariableDeclarator(initializer=first_initializer)]
    while self.try_accept(','):
        declarators.append(self.parse_variable_declarator())
    return declarators
@parse_debug
def parse_for_init_or_update(self):
    """Parse a comma-separated expression list (for init/update part)."""
    parts = [self.parse_expression()]
    while self.try_accept(','):
        parts.append(self.parse_expression())
    return parts
# ------------------------------------------------------------------------------
# -- Expressions --
@parse_debug
def parse_expression(self):
    """Parse an expression, folding a trailing assignment if present."""
    target = self.parse_expressionl()
    if self.tokens.look().value in Operator.ASSIGNMENT:
        op = self.tokens.next().value
        # Assignment is right-associative, so recurse for the value.
        value = self.parse_expression()
        return tree.Assignment(expressionl=target,
                               type=op,
                               value=value)
    return target
@parse_debug
def parse_expressionl(self):
    """Parse a ternary / single-param lambda / method-reference level
    expression."""
    condition = self.parse_expression_2()

    if self.try_accept('?'):
        # Ternary: right-associative on the false branch.
        when_true = self.parse_expression()
        self.accept(':')
        when_false = self.parse_expressionl()
        return tree.TernaryExpression(condition=condition,
                                      if_true=when_true,
                                      if_false=when_false)

    if self.would_accept('->'):
        # Single-parameter lambda: 'x -> body'.
        body = self.parse_lambda_method_body()
        return tree.LambdaExpression(parameters=[condition],
                                     body=body)

    if self.try_accept('::'):
        method_reference, type_arguments = self.parse_method_reference()
        return tree.MethodReference(expression=condition,
                                    method=method_reference,
                                    type_arguments=type_arguments)

    return condition
@parse_debug
def parse_expression_2(self):
    """Parse a binary-operator level expression."""
    left = self.parse_expression_3()
    lookahead = self.tokens.look()
    if lookahead.value in Operator.INFIX or lookahead.value == 'instanceof':
        parts = self.parse_expression_2_rest()
        parts.insert(0, left)
        return self.build_binary_operation(parts)
    return left
@parse_debug
def parse_expression_2_rest(self):
    """Collect alternating (operator, operand) pairs after the first
    operand of a binary expression."""
    parts = list()
    lookahead = self.tokens.look()
    while lookahead.value in Operator.INFIX or lookahead.value == 'instanceof':
        if self.try_accept('instanceof'):
            parts.extend(('instanceof', self.parse_type()))
        else:
            op = self.parse_infix_operator()
            parts.extend((op, self.parse_expression_3()))
        lookahead = self.tokens.look()
    return parts
# ------------------------------------------------------------------------------
# -- Expression operators --
@parse_debug
def parse_expression_3(self):
    """Parse a unary-level expression: prefix operators, then either a
    parenthesized lambda, a cast, or a primary with selectors and
    postfix operators.

    The '(' case is disambiguated speculatively: the token-stream
    context manager rolls back on JavaSyntaxError.
    """
    prefix_operators = list()
    while self.tokens.look().value in Operator.PREFIX:
        prefix_operators.append(self.tokens.next().value)
    if self.would_accept('('):
        # Try '(params) -> ...' first, then '(Type) expr' as a cast.
        try:
            with self.tokens:
                lambda_exp = self.parse_lambda_expression()
                if lambda_exp:
                    return lambda_exp
        except JavaSyntaxError:
            pass
        try:
            with self.tokens:
                self.accept('(')
                cast_target = self.parse_type()
                self.accept(')')
                expression = self.parse_expression_3()
                return tree.Cast(type=cast_target,
                                 expression=expression)
        except JavaSyntaxError:
            pass
    primary = self.parse_primary()
    primary.prefix_operators = prefix_operators
    primary.selectors = list()
    primary.postfix_operators = list()
    # Trailing '[...]' index and '.member' selectors.
    token = self.tokens.look()
    while token.value in '[.':
        selector = self.parse_selector()
        primary.selectors.append(selector)
        token = self.tokens.look()
    # Postfix '++' / '--'.
    while token.value in Operator.POSTFIX:
        primary.postfix_operators.append(self.tokens.next().value)
        token = self.tokens.look()
    return primary
@parse_debug
def parse_method_reference(self):
    """Parse the right-hand side of '::' — optional type arguments and
    either 'new' or a referenced expression."""
    type_args = list()
    if self.would_accept('<'):
        type_args = self.parse_nonwildcard_type_arguments()
    if self.would_accept('new'):
        reference = tree.MemberReference(member=self.accept('new'))
    else:
        reference = self.parse_expression()
    return reference, type_args
@parse_debug
def parse_lambda_expression(self):
    """Parse a parenthesized lambda: inferred or formal parameters
    followed by '->' and a body."""
    if self.would_accept('(', Identifier, ','):
        # '(a, b, ...)' — inferred-type parameter list.
        self.accept('(')
        params = []
        while not self.would_accept(')'):
            params.append(tree.InferredFormalParameter(
                name=self.parse_identifier()))
            self.try_accept(',')
        self.accept(')')
    else:
        params = self.parse_formal_parameters()
    body = self.parse_lambda_method_body()
    return tree.LambdaExpression(parameters=params,
                                 body=body)
@parse_debug
def parse_lambda_method_body(self):
    """Parse the body after '->': a block or a single expression."""
    if self.accept('->'):
        if self.would_accept('{'):
            return self.parse_block()
        return self.parse_expression()
@parse_debug
def parse_infix_operator(self):
    """Parse a binary infix operator, reassembling '>>' and '>>>'.

    '>' tokens arrive individually (this lets generic type argument
    lists close), so right-shift operators are rebuilt here from
    consecutive '>' tokens. Raises JavaSyntaxError via self.illegal()
    when the operator is not a valid infix operator.
    """
    operator = self.accept(Operator)

    # Idiomatic membership test (was: 'not operator in ...').
    if operator not in Operator.INFIX:
        self.illegal("Expected infix operator")

    if operator == '>' and self.try_accept('>'):
        operator = '>>'
        # Only look for a third '>' once '>>' is confirmed; checking
        # unconditionally would consume '>' after unrelated operators.
        if self.try_accept('>'):
            operator = '>>>'

    return operator
# ------------------------------------------------------------------------------
# -- Primary expressions --
@parse_debug
def parse_primary(self):
    """Parse a primary expression: literal, parenthesized expression,
    this/super form, creator, explicit generic invocation, qualified
    identifier, or class literal.

    Raises JavaSyntaxError (via self.illegal) when nothing matches.
    """
    token = self.tokens.look()
    if isinstance(token, Literal):
        return self.parse_literal()
    elif token.value == '(':
        return self.parse_par_expression()
    elif self.try_accept('this'):
        arguments = None
        if self.would_accept('('):
            arguments = self.parse_arguments()
            return tree.ExplicitConstructorInvocation(arguments=arguments)
        return tree.This()
    elif self.would_accept('super', '::'):
        # 'super::name' — hand the token back for method-reference parsing.
        self.accept('super')
        return token
    elif self.try_accept('super'):
        super_suffix = self.parse_super_suffix()
        return super_suffix
    elif self.try_accept('new'):
        return self.parse_creator()
    elif token.value == '<':
        type_arguments = self.parse_nonwildcard_type_arguments()
        if self.try_accept('this'):
            arguments = self.parse_arguments()
            return tree.ExplicitConstructorInvocation(type_arguments=type_arguments,
                                                      arguments=arguments)
        else:
            invocation = self.parse_explicit_generic_invocation_suffix()
            invocation.type_arguments = type_arguments
            return invocation
    elif isinstance(token, Identifier):
        qualified_identifier = [self.parse_identifier()]
        while self.would_accept('.', Identifier):
            self.accept('.')
            identifier = self.parse_identifier()
            qualified_identifier.append(identifier)
        identifier_suffix = self.parse_identifier_suffix()
        if isinstance(identifier_suffix, (tree.MemberReference, tree.MethodInvocation)):
            # Take the last identifier as the member and leave the rest
            # for the qualifier
            identifier_suffix.member = qualified_identifier.pop()
        elif isinstance(identifier_suffix, tree.ClassReference):
            identifier_suffix.type = tree.ReferenceType(name=qualified_identifier.pop())
        identifier_suffix.qualifier = '.'.join(qualified_identifier)
        return identifier_suffix
    elif isinstance(token, BasicType):
        base_type = self.parse_basic_type()
        base_type.dimensions = self.parse_array_dimension()
        self.accept('.', 'class')
        return tree.ClassReference(type=base_type)
    elif self.try_accept('void'):
        self.accept('.', 'class')
        return tree.VoidClassReference()
    self.illegal("Expected expression")
@parse_debug
def parse_literal(self):
    """Wrap the next literal token in a tree.Literal node."""
    value = self.accept(Literal)
    return tree.Literal(value=value)
@parse_debug
def parse_par_expression(self):
    """Parse '( expression )' and return the inner expression."""
    self.accept('(')
    inner = self.parse_expression()
    self.accept(')')
    return inner
@parse_debug
def parse_arguments(self):
    """Parse a parenthesized, comma-separated argument list."""
    args = list()
    self.accept('(')
    if self.try_accept(')'):
        return args
    while True:
        args.append(self.parse_expression())
        if not self.try_accept(','):
            break
    self.accept(')')
    return args
@parse_debug
def parse_super_suffix(self):
    """Parse what follows 'super': '.member', '.<T>method(...)', or a
    constructor call '(...)'."""
    member = None
    type_args = None
    args = None

    if self.try_accept('.'):
        if self.would_accept('<'):
            type_args = self.parse_nonwildcard_type_arguments()
        member = self.parse_identifier()
        if self.would_accept('('):
            args = self.parse_arguments()
    else:
        args = self.parse_arguments()

    if member and args is not None:
        return tree.SuperMethodInvocation(member=member,
                                          arguments=args,
                                          type_arguments=type_args)
    if args is not None:
        return tree.SuperConstructorInvocation(arguments=args)
    return tree.SuperMemberReference(member=member)
@parse_debug
def parse_explicit_generic_invocation_suffix(self):
    """After explicit type arguments: either a super suffix or a plain
    method invocation."""
    if self.try_accept('super'):
        return self.parse_super_suffix()
    member = self.parse_identifier()
    args = self.parse_arguments()
    return tree.MethodInvocation(member=member,
                                 arguments=args)
# ------------------------------------------------------------------------------
# -- Creators --
@parse_debug
def parse_creator(self):
    """Parse what follows 'new': a basic-type array creator, a class
    creator, or a reference-type array creator."""
    constructor_type_arguments = None

    if self.would_accept(BasicType):
        # e.g. 'new int[...]'.
        element_type = self.parse_basic_type()
        creator = self.parse_array_creator_rest()
        creator.type = element_type
        return creator

    if self.would_accept('<'):
        constructor_type_arguments = self.parse_nonwildcard_type_arguments()

    created_type = self.parse_created_name()

    if self.would_accept('['):
        if constructor_type_arguments:
            self.illegal("Array creator not allowed with generic constructor type arguments")
        creator = self.parse_array_creator_rest()
        creator.type = created_type
        return creator

    arguments, body = self.parse_class_creator_rest()
    return tree.ClassCreator(constructor_type_arguments=constructor_type_arguments,
                             type=created_type,
                             arguments=arguments,
                             body=body)
@parse_debug
def parse_created_name(self):
    """Parse a possibly-qualified, possibly-generic type name after
    'new', building a chain of ReferenceType nodes."""
    head = tree.ReferenceType()
    node = head
    while True:
        node.name = self.parse_identifier()
        if self.would_accept('<'):
            node.arguments = self.parse_type_arguments_or_diamond()
        if not self.try_accept('.'):
            break
        # Qualified name: chain another ReferenceType as sub_type.
        node.sub_type = tree.ReferenceType()
        node = node.sub_type
    return head
@parse_debug
def parse_class_creator_rest(self):
    """Parse constructor arguments plus an optional anonymous body."""
    args = self.parse_arguments()
    body = None
    if self.would_accept('{'):
        body = self.parse_class_body()
    return (args, body)
@parse_debug
def parse_array_creator_rest(self):
    """Parse the array part of a creator: either '[]... {init}' or
    sized dimensions '[expr]...' with optional trailing '[]'."""
    if self.would_accept('[', ']'):
        # All dimensions empty — an initializer must follow.
        dims = self.parse_array_dimension()
        initializer = self.parse_array_initializer()
        return tree.ArrayCreator(dimensions=dims,
                                 initializer=initializer)

    sized_dims = list()
    while self.would_accept('[') and not self.would_accept('[', ']'):
        self.accept('[')
        sized_dims.append(self.parse_expression())
        self.accept(']')
    # Any remaining empty '[]' pairs.
    sized_dims += self.parse_array_dimension()
    return tree.ArrayCreator(dimensions=sized_dims)
@parse_debug
def parse_identifier_suffix(self):
    """Parse what may follow a qualified identifier: class literal,
    call arguments, '.this', explicit generic invocation, '.new'
    inner creator, '.super(...)', or nothing (plain member reference).
    """
    if self.try_accept('[', ']'):
        # 'Name[]....class' — the leading None marks the '[]' consumed here.
        array_dimension = [None] + self.parse_array_dimension()
        self.accept('.', 'class')
        return tree.ClassReference(type=tree.Type(dimensions=array_dimension))
    elif self.would_accept('('):
        arguments = self.parse_arguments()
        return tree.MethodInvocation(arguments=arguments)
    elif self.try_accept('.', 'class'):
        return tree.ClassReference()
    elif self.try_accept('.', 'this'):
        return tree.This()
    elif self.would_accept('.', '<'):
        # Consume the '.' and let the generic-invocation parser take over.
        next(self.tokens)
        return self.parse_explicit_generic_invocation()
    elif self.try_accept('.', 'new'):
        type_arguments = None
        if self.would_accept('<'):
            type_arguments = self.parse_nonwildcard_type_arguments()
        inner_creator = self.parse_inner_creator()
        inner_creator.constructor_type_arguments = type_arguments
        return inner_creator
    elif self.would_accept('.', 'super', '('):
        self.accept('.', 'super')
        arguments = self.parse_arguments()
        return tree.SuperConstructorInvocation(arguments=arguments)
    else:
        return tree.MemberReference()
@parse_debug
def parse_explicit_generic_invocation(self):
    """Parse '<T,...>suffix' and attach the type arguments."""
    type_args = self.parse_nonwildcard_type_arguments()
    invocation = self.parse_explicit_generic_invocation_suffix()
    invocation.type_arguments = type_args
    return invocation
@parse_debug
def parse_inner_creator(self):
    """Parse 'Inner<...>(...) {...}' of a qualified '.new' creator."""
    name = self.parse_identifier()
    type_args = None
    if self.would_accept('<'):
        type_args = self.parse_nonwildcard_type_arguments_or_diamond()
    inner_type = tree.ReferenceType(name=name,
                                    arguments=type_args)
    args, body = self.parse_class_creator_rest()
    return tree.InnerClassCreator(type=inner_type,
                                  arguments=args,
                                  body=body)
@parse_debug
def parse_selector(self):
    """Parse one selector following a primary: an array index, a member
    access/invocation, an explicit generic invocation, '.this',
    '.super...', or a '.new' inner creator.

    Raises JavaSyntaxError (via self.illegal) when nothing matches.
    """
    if self.try_accept('['):
        expression = self.parse_expression()
        self.accept(']')
        return tree.ArraySelector(index=expression)
    elif self.try_accept('.'):
        token = self.tokens.look()
        if isinstance(token, Identifier):
            identifier = self.tokens.next().value
            arguments = None
            if self.would_accept('('):
                arguments = self.parse_arguments()
                return tree.MethodInvocation(member=identifier,
                                             arguments=arguments)
            else:
                return tree.MemberReference(member=identifier)
        elif self.would_accept('super', '::'):
            # 'super::name' — hand the token back for method-reference parsing.
            self.accept('super')
            return token
        elif self.would_accept('<'):
            return self.parse_explicit_generic_invocation()
        elif self.try_accept('this'):
            return tree.This()
        elif self.try_accept('super'):
            return self.parse_super_suffix()
        elif self.try_accept('new'):
            type_arguments = None
            if self.would_accept('<'):
                type_arguments = self.parse_nonwildcard_type_arguments()
            inner_creator = self.parse_inner_creator()
            inner_creator.constructor_type_arguments = type_arguments
            return inner_creator
    self.illegal("Expected selector")
# ------------------------------------------------------------------------------
# -- Enum and annotation body --
@parse_debug
def parse_enum_body(self):
    """Parse '{ constants [; declarations] }' of an enum."""
    constants = list()
    declarations = list()

    self.accept('{')
    if not self.try_accept(','):
        while not (self.would_accept(';') or self.would_accept('}')):
            constants.append(self.parse_enum_constant())
            if not self.try_accept(','):
                break
    if self.try_accept(';'):
        # Member declarations follow the constant list.
        while not self.would_accept('}'):
            member = self.parse_class_body_declaration()
            if member:
                declarations.append(member)
    self.accept('}')

    return tree.EnumBody(constants=constants,
                         declarations=declarations)
@parse_debug
def parse_enum_constant(self):
    """Parse one enum constant: annotations, name, optional arguments
    and optional class body; captures any leading javadoc."""
    annotations = list()
    javadoc = None
    arguments = None
    body = None

    peeked = self.tokens.look()
    if peeked:
        javadoc = peeked.javadoc

    if self.would_accept(Annotation):
        annotations = self.parse_annotations()

    name = self.parse_identifier()

    if self.would_accept('('):
        arguments = self.parse_arguments()
    if self.would_accept('{'):
        body = self.parse_class_body()

    return tree.EnumConstantDeclaration(annotations=annotations,
                                        name=name,
                                        arguments=arguments,
                                        body=body,
                                        documentation=javadoc)
@parse_debug
def parse_annotation_type_body(self):
    """Parse the '{ ... }' of an annotation type; returns its members."""
    self.accept('{')
    members = self.parse_annotation_type_element_declarations()
    self.accept('}')
    return members
@parse_debug
def parse_annotation_type_element_declarations(self):
    """Parse annotation type members until the closing '}'."""
    members = list()
    while not self.would_accept('}'):
        members.append(self.parse_annotation_type_element_declaration())
    return members
@parse_debug
def parse_annotation_type_element_declaration(self):
    """Parse one member of an annotation type: a nested type
    declaration or an attribute (annotation method / constant),
    attaching the shared modifiers, annotations, and javadoc.
    """
    modifiers, annotations, javadoc = self.parse_modifiers()
    declaration = None
    if self.would_accept('class'):
        declaration = self.parse_normal_class_declaration()
    elif self.would_accept('interface'):
        declaration = self.parse_normal_interface_declaration()
    elif self.would_accept('enum'):
        declaration = self.parse_enum_declaration()
    elif self.is_annotation_declaration():
        declaration = self.parse_annotation_type_declaration()
    else:
        attribute_type = self.parse_type()
        attribute_name = self.parse_identifier()
        declaration = self.parse_annotation_method_or_constant_rest()
        self.accept(';')
        if isinstance(declaration, tree.AnnotationMethod):
            declaration.name = attribute_name
            declaration.return_type = attribute_type
        else:
            # Constant form: the rest-parser left the first declarator's
            # name and the overall type unset.
            declaration.declarators[0].name = attribute_name
            declaration.type = attribute_type
    declaration.modifiers = modifiers
    declaration.annotations = annotations
    declaration.documentation = javadoc
    return declaration
@parse_debug
def parse_annotation_method_or_constant_rest(self):
    """Disambiguate an annotation attribute: '()' starts a method
    (with optional default value); otherwise constants follow."""
    if not self.try_accept('('):
        return self.parse_constant_declarators_rest()

    self.accept(')')
    dims = self.parse_array_dimension()
    default_value = None
    if self.try_accept('default'):
        default_value = self.parse_element_value()
    return tree.AnnotationMethod(dimensions=dims,
                                 default=default_value)
def parse(tokens, debug=False):
    """Parse a token stream with a fresh Parser instance.

    debug enables verbose parse tracing on the parser.
    """
    java_parser = Parser(tokens)
    java_parser.set_debug(debug)
    return java_parser.parse()
================================================
FILE: baseline_tokenization/javalang/test/__init__.py
================================================
================================================
FILE: baseline_tokenization/javalang/test/source/package-info/AnnotationJavadoc.java
================================================
@Package
/**
Test that includes java doc first but no annotation
*/
package org.javalang.test;
================================================
FILE: baseline_tokenization/javalang/test/source/package-info/AnnotationOnly.java
================================================
@Package
package org.javalang.test;
================================================
FILE: baseline_tokenization/javalang/test/source/package-info/JavadocAnnotation.java
================================================
/**
Test that includes java doc first but no annotation
*/
@Package
package org.javalang.test;
================================================
FILE: baseline_tokenization/javalang/test/source/package-info/JavadocOnly.java
================================================
/**
Test that includes java doc first but no annotation
*/
package org.javalang.test;
================================================
FILE: baseline_tokenization/javalang/test/source/package-info/NoAnnotationNoJavadoc.java
================================================
package org.javalang.test;
================================================
FILE: baseline_tokenization/javalang/test/test_java_8_syntax.py
================================================
import unittest
from pkg_resources import resource_string
from .. import parse, parser, tree
def setup_java_class(content_to_add):
    """ returns an example java class with the
    given content_to_add contained within a method.
    """
    # The snippet is spliced into main's body via %-formatting; the
    # template's exact indentation is cosmetic only — the parser under
    # test ignores whitespace.
    template = """
public class Lambda {
    public static void main(String args[]) {
        %s
    }
}
"""
    return template % content_to_add
def filter_type_in_method(clazz, the_type, method_name):
    """ yields the (path, node) results of filtering the given class
    for the given type, keeping only nodes that sit inside the method
    identified by method_name.
    """
    for path, node in clazz.filter(the_type):
        for ancestor in reversed(path):
            if (isinstance(ancestor, tree.MethodDeclaration)
                    and ancestor.name == method_name):
                yield path, node
class LambdaSupportTest(unittest.TestCase):
""" Contains tests for java 8 lambda syntax. """
def assert_contains_lambda_expression_in_m(
        self, clazz, method_name='main'):
    """ asserts that the given tree contains a method with the supplied
    method name containing a lambda expression.

    Returns the (path, node) matches so callers can inspect them.
    """
    matches = list(filter_type_in_method(
        clazz, tree.LambdaExpression, method_name))
    if not matches:
        self.fail('No matching lambda expression found.')
    return matches
def test_lambda_support_no_parameters_no_body(self):
    """ tests support for lambda with no parameters and no body. """
    clazz = parse.parse(setup_java_class("() -> {};"))
    self.assert_contains_lambda_expression_in_m(clazz)
def test_lambda_support_no_parameters_expression_body(self):
    """ tests support for lambda with no parameters and an
    expression body.
    """
    snippets = [
        "() -> 3;",
        "() -> null;",
        "() -> { return 21; };",
        "() -> { System.exit(1); };",
    ]
    for snippet in snippets:
        clazz = parse.parse(setup_java_class(snippet))
        self.assert_contains_lambda_expression_in_m(clazz)
def test_lambda_support_no_parameters_complex_expression(self):
    """Parses a parameterless lambda whose body holds conditional
    logic and a nested block.
    """
    snippet = """
() -> {
if (true) return 21;
else
{
int result = 21;
return result / 2;
}
};"""
    parsed = parse.parse(setup_java_class(snippet))
    self.assert_contains_lambda_expression_in_m(parsed)
def test_parameter_no_type_expression_body(self):
    """Parses lambdas whose parameter types are inferred."""
    snippets = (
        "(bar) -> bar + 1;",
        "bar -> bar + 1;",
        "x -> x.length();",
        "y -> { y.boom(); };",
    )
    for snippet in snippets:
        parsed = parse.parse(setup_java_class(snippet))
        self.assert_contains_lambda_expression_in_m(parsed)
def test_parameter_with_type_expression_body(self):
""" tests support for lambda with parameters with formal types. """
test_classes = [
s
gitextract_t37_dsto/ ├── .gitignore ├── CITATION.cff ├── CSharpExtractor/ │ ├── .gitattributes │ ├── .gitignore │ ├── CSharpExtractor/ │ │ ├── .nuget/ │ │ │ └── packages.config │ │ ├── CSharpExtractor.sln │ │ └── Extractor/ │ │ ├── Extractor.cs │ │ ├── Extractor.csproj │ │ ├── PathFinder.cs │ │ ├── Program.cs │ │ ├── Properties/ │ │ │ └── launchSettings.json │ │ ├── Temp.cs │ │ ├── Tree/ │ │ │ └── Tree.cs │ │ ├── Utilities.cs │ │ └── Variable.cs │ └── extract.py ├── Input.java ├── JavaExtractor/ │ ├── JPredict/ │ │ ├── .classpath │ │ ├── .gitignore │ │ ├── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ ├── JavaExtractor/ │ │ │ │ ├── App.java │ │ │ │ ├── Common/ │ │ │ │ │ ├── CommandLineValues.java │ │ │ │ │ ├── Common.java │ │ │ │ │ └── MethodContent.java │ │ │ │ ├── ExtractFeaturesTask.java │ │ │ │ ├── FeatureExtractor.java │ │ │ │ ├── FeaturesEntities/ │ │ │ │ │ ├── ProgramFeatures.java │ │ │ │ │ ├── ProgramRelation.java │ │ │ │ │ └── Property.java │ │ │ │ └── Visitors/ │ │ │ │ ├── FunctionVisitor.java │ │ │ │ └── LeavesCollectorVisitor.java │ │ │ └── Test.java │ │ └── target/ │ │ └── JavaExtractor-0.0.1-SNAPSHOT.jar │ └── extract.py ├── LICENSE ├── Python150kExtractor/ │ ├── README.md │ ├── extract.py │ └── preprocess.sh ├── README.md ├── __init__.py ├── baseline_tokenization/ │ ├── input_example.txt │ ├── javalang/ │ │ ├── __init__.py │ │ ├── ast.py │ │ ├── javadoc.py │ │ ├── parse.py │ │ ├── parser.py │ │ ├── test/ │ │ │ ├── __init__.py │ │ │ ├── source/ │ │ │ │ └── package-info/ │ │ │ │ ├── AnnotationJavadoc.java │ │ │ │ ├── AnnotationOnly.java │ │ │ │ ├── JavadocAnnotation.java │ │ │ │ ├── JavadocOnly.java │ │ │ │ └── NoAnnotationNoJavadoc.java │ │ │ ├── test_java_8_syntax.py │ │ │ ├── test_javadoc.py │ │ │ ├── test_package_declaration.py │ │ │ └── test_util.py │ │ ├── tokenizer.py │ │ ├── tree.py │ │ └── util.py │ └── subtokenize_nmt_baseline.py ├── code2seq.py ├── common.py ├── config.py ├── extractor.py ├── interactive_predict.py ├── model.py ├── 
preprocess.py ├── preprocess.sh ├── preprocess_csharp.sh ├── reader.py ├── train.sh └── train_python150k.sh
SYMBOL INDEX (557 symbols across 42 files)
FILE: CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
class Extractor (line 14) | public class Extractor
method Extractor (line 31) | public Extractor(string code, Options opts)
method PathNodesToString (line 43) | private string PathNodesToString(PathFinder.Path path)
method GetTruncatedChildId (line 87) | private int GetTruncatedChildId(SyntaxNode n)
method PathToString (line 98) | private string PathToString(PathFinder.Path path)
method GetInternalPaths (line 108) | internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
method SplitNameUnlessEmpty (line 137) | private string SplitNameUnlessEmpty(string original)
method Extract (line 165) | public List<String> Extract()
method MaybeHash (line 221) | private string MaybeHash(string v)
FILE: CSharpExtractor/CSharpExtractor/Extractor/PathFinder.cs
class PathFinder (line 10) | internal class PathFinder
class Path (line 12) | internal class Path
method Path (line 20) | public Path(SyntaxToken left, IEnumerable<SyntaxNode> leftSide, Synt...
method PathFinder (line 36) | public PathFinder(Tree tree, int length = 7, int width = 4)
method GetDepth (line 46) | private int GetDepth(SyntaxNode n)
method FirstAncestor (line 57) | public SyntaxNode FirstAncestor(SyntaxNode l, SyntaxNode r)
method CollectPathToParent (line 73) | private IEnumerable<SyntaxNode> CollectPathToParent(SyntaxNode start, ...
method FindPath (line 82) | internal Path FindPath(SyntaxToken l, SyntaxToken r, bool limited = true)
FILE: CSharpExtractor/CSharpExtractor/Extractor/Program.cs
class Program (line 10) | class Program
method ExtractSingleFile (line 12) | static List<String> ExtractSingleFile(string filename, Options opts)
method Main (line 21) | static void Main(string[] args)
FILE: CSharpExtractor/CSharpExtractor/Extractor/Temp.cs
class Temp (line 3) | class Temp
class NestedClass (line 5) | class NestedClass
method fooBar (line 7) | void fooBar()
FILE: CSharpExtractor/CSharpExtractor/Extractor/Tree/Tree.cs
class Tree (line 12) | public class Tree
method IsScopeEnder (line 31) | public static bool IsScopeEnder(SyntaxNode node)
class TreeBuilderWalker (line 36) | class TreeBuilderWalker : CSharpSyntaxWalker
method TreeBuilderWalker (line 44) | internal TreeBuilderWalker(Dictionary<SyntaxNode, Node> nodes, Dicti...
method Visit (line 51) | public override
method GetRoot (line 91) | internal SyntaxNode GetRoot()
method Tree (line 100) | public Tree(SyntaxNode syntaxTree)
class Node (line 116) | public class Node
method Node (line 118) | public Node(SyntaxNode This, HashSet<SyntaxNode> Ancestors, SyntaxNode...
method Equals (line 148) | public override bool Equals(object obj)
method GetHashCode (line 160) | public override int GetHashCode()
class Leaf (line 166) | public class Leaf
method IsLeafToken (line 168) | internal static bool IsLeafToken(SyntaxToken token)
method Leaf (line 192) | public Leaf(Dictionary<SyntaxNode, Node> nodes, SyntaxToken token)
class SyntaxViewer (line 206) | public class SyntaxViewer
method ToDot (line 208) | private string ToDot(SyntaxTree tree)
method SyntaxViewer (line 257) | public SyntaxViewer(SyntaxTree tree, string path = "out.ong")
FILE: CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
class Options (line 11) | public class Options
class Utilities (line 35) | public static class Utilities
method Choose2 (line 38) | public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enume...
method ReservoirSample (line 58) | public static IEnumerable<TSource> ReservoirSample<TSource>(this IEnum...
method WeakConcat (line 84) | public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1,...
method SplitToSubtokens (line 92) | public static IEnumerable<String> SplitToSubtokens(String name)
method NormalizeName (line 103) | public static String NormalizeName(string s)
FILE: CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
class Variable (line 11) | public class Variable
method Variable (line 35) | private Variable(string name, SyntaxToken[] leaves, Tree tree)
method GetHashCode (line 54) | public override int GetHashCode()
method IsLiteral (line 59) | public bool IsLiteral()
method isMethodName (line 64) | internal static Boolean isMethodName(SyntaxToken token)
method CreateFromMethod (line 71) | internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
FILE: CSharpExtractor/extract.py
function get_immediate_subdirectories (line 16) | def get_immediate_subdirectories(a_dir):
function ParallelExtractDir (line 23) | def ParallelExtractDir(args, dir):
function ExtractFeaturesForDir (line 27) | def ExtractFeaturesForDir(args, dir, prefix):
function ExtractFeaturesForDirsList (line 58) | def ExtractFeaturesForDirsList(args, dirs):
FILE: Input.java
method getName (line 1) | public String getName() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java
class App (line 16) | public class App {
method main (line 19) | public static void main(String[] args) {
method extractDir (line 36) | private static void extractDir() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java
class CommandLineValues (line 12) | public class CommandLineValues {
method CommandLineValues (line 46) | public CommandLineValues(String... args) throws CmdLineException {
method CommandLineValues (line 57) | public CommandLineValues() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
class Common (line 11) | public final class Common {
method normalizeName (line 26) | public static String normalizeName(String original, String defaultStri...
method isMethod (line 45) | public static boolean isMethod(Node node, String type) {
method splitToSubtokens (line 55) | public static ArrayList<String> splitToSubtokens(String str1) {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java
class MethodContent (line 7) | public class MethodContent {
method MethodContent (line 13) | public MethodContent(ArrayList<Node> leaves, String name, String conte...
method getLeaves (line 19) | public ArrayList<Node> getLeaves() {
method getName (line 23) | public String getName() {
method getContent (line 27) | public String getContent() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java
class ExtractFeaturesTask (line 17) | class ExtractFeaturesTask implements Callable<Void> {
method ExtractFeaturesTask (line 21) | public ExtractFeaturesTask(CommandLineValues commandLineValues, Path p...
method call (line 26) | @Override
method processFile (line 32) | public void processFile() {
method extractSingleFile (line 50) | private ArrayList<ProgramFeatures> extractSingleFile() throws IOExcept...
method featuresToString (line 68) | public String featuresToString(ArrayList<ProgramFeatures> features) {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java
class FeatureExtractor (line 23) | @SuppressWarnings("StringEquality")
method FeatureExtractor (line 33) | public FeatureExtractor(CommandLineValues commandLineValues, Path file...
method getTreeStack (line 38) | private static ArrayList<Node> getTreeStack(Node node) {
method extractFeatures (line 48) | public ArrayList<ProgramFeatures> extractFeatures(String code) {
method parseFileWithRetries (line 59) | private CompilationUnit parseFileWithRetries(String code) {
method generatePathFeatures (line 84) | private ArrayList<ProgramFeatures> generatePathFeatures(ArrayList<Meth...
method generatePathFeaturesForFunction (line 95) | private ProgramFeatures generatePathFeaturesForFunction(MethodContent ...
method generatePath (line 115) | private String generatePath(Node source, Node target, String separator) {
method saturateChildId (line 184) | private Integer saturateChildId(int childId) {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java
class ProgramFeatures (line 7) | public class ProgramFeatures {
method ProgramFeatures (line 15) | public ProgramFeatures(String name, Path filePath, String textContent) {
method toString (line 22) | @SuppressWarnings("StringBufferReplaceableByString")
method addFeature (line 32) | public void addFeature(Property source, String path, Property target) {
method isEmpty (line 37) | public boolean isEmpty() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java
class ProgramRelation (line 3) | public class ProgramRelation {
method ProgramRelation (line 8) | public ProgramRelation(Property sourceName, Property targetName, Strin...
method toString (line 14) | public String toString() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java
class Property (line 15) | public class Property {
method Property (line 137) | public Property(Node node, boolean isLeaf, boolean isGenericParent) {
method getRawType (line 189) | public String getRawType() {
method getType (line 193) | public String getType() {
method getType (line 197) | public String getType(boolean shorten) {
method getName (line 205) | public String getName() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java
class FunctionVisitor (line 13) | @SuppressWarnings("StringEquality")
method FunctionVisitor (line 18) | public FunctionVisitor(CommandLineValues commandLineValues) {
method visit (line 22) | @Override
method visitMethod (line 29) | private void visitMethod(MethodDeclaration node) {
method getMethodLength (line 52) | private long getMethodLength(String code) {
method getMethodContents (line 66) | public ArrayList<MethodContent> getMethodContents() {
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java
class LeavesCollectorVisitor (line 15) | public class LeavesCollectorVisitor extends TreeVisitor {
method process (line 18) | @Override
method isGenericParent (line 38) | private boolean isGenericParent(Node node) {
method hasNoChildren (line 44) | private boolean hasNoChildren(Node node) {
method isNotComment (line 48) | private boolean isNotComment(Node node) {
method getLeaves (line 52) | public ArrayList<Node> getLeaves() {
method getChildId (line 56) | private int getChildId(Node node) {
FILE: JavaExtractor/JPredict/src/main/java/Test.java
class Test (line 1) | class Test {
method fooBar (line 2) | void fooBar() {
FILE: JavaExtractor/extract.py
function get_immediate_subdirectories (line 13) | def get_immediate_subdirectories(a_dir):
function ParallelExtractDir (line 21) | def ParallelExtractDir(args, dir):
function ExtractFeaturesForDir (line 25) | def ExtractFeaturesForDir(args, dir, prefix):
function ExtractFeaturesForDirsList (line 59) | def ExtractFeaturesForDirsList(args, dirs):
FILE: Python150kExtractor/extract.py
function __collect_asts (line 27) | def __collect_asts(json_file):
function __terminals (line 33) | def __terminals(ast, node_index, args):
function __merge_terminals2_paths (line 66) | def __merge_terminals2_paths(v_path, u_path):
function __raw_tree_paths (line 78) | def __raw_tree_paths(ast, node_index, args):
function __delim_name (line 96) | def __delim_name(name):
function __collect_sample (line 114) | def __collect_sample(ast, fd_index, args):
function __collect_samples (line 141) | def __collect_samples(ast, args):
function __collect_all_and_save (line 152) | def __collect_all_and_save(asts, args, output_file):
function main (line 164) | def main():
FILE: baseline_tokenization/javalang/ast.py
class MetaNode (line 6) | class MetaNode(type):
method __new__ (line 7) | def __new__(mcs, name, bases, dict):
class Node (line 21) | class Node(object):
method __init__ (line 24) | def __init__(self, **kwargs):
method __equals__ (line 34) | def __equals__(self, other):
method __repr__ (line 44) | def __repr__(self):
method __iter__ (line 47) | def __iter__(self):
method filter (line 50) | def filter(self, pattern):
method children (line 57) | def children(self):
function walk_tree (line 60) | def walk_tree(root):
function dump (line 74) | def dump(ast, file):
function load (line 77) | def load(file):
FILE: baseline_tokenization/javalang/javadoc.py
function join (line 4) | def join(s):
class DocBlock (line 7) | class DocBlock(object):
method __init__ (line 8) | def __init__(self):
method add_block (line 22) | def add_block(self, name, value):
function _sanitize (line 54) | def _sanitize(s):
function _uncomment (line 64) | def _uncomment(s):
function _get_indent_level (line 70) | def _get_indent_level(s):
function _left_justify (line 73) | def _left_justify(s):
function _force_blocks_left (line 92) | def _force_blocks_left(s):
function parse (line 95) | def parse(raw):
FILE: baseline_tokenization/javalang/parse.py
function parse_expression (line 5) | def parse_expression(exp):
function parse_member_signature (line 14) | def parse_member_signature(sig):
function parse_constructor_signature (line 23) | def parse_constructor_signature(sig):
function parse_type (line 34) | def parse_type(s):
function parse_type_signature (line 40) | def parse_type_signature(sig):
function parse (line 50) | def parse(s):
FILE: baseline_tokenization/javalang/parser.py
function parse_debug (line 12) | def parse_debug(method):
class JavaParserBaseException (line 65) | class JavaParserBaseException(Exception):
method __init__ (line 66) | def __init__(self, message=''):
class JavaSyntaxError (line 69) | class JavaSyntaxError(JavaParserBaseException):
method __init__ (line 70) | def __init__(self, description, at=None):
class JavaParserError (line 76) | class JavaParserError(JavaParserBaseException):
class Parser (line 82) | class Parser(object):
method __init__ (line 94) | def __init__(self, tokens):
method set_debug (line 103) | def set_debug(self, debug=True):
method parse (line 109) | def parse(self):
method illegal (line 115) | def illegal(self, description, at=None):
method accept (line 121) | def accept(self, *accepts):
method would_accept (line 139) | def would_accept(self, *accepts):
method try_accept (line 154) | def try_accept(self, *accepts):
method build_binary_operation (line 172) | def build_binary_operation(self, parts, start_level=0):
method is_annotation (line 206) | def is_annotation(self, i=0):
method is_annotation_declaration (line 215) | def is_annotation_declaration(self, i=0):
method parse_identifier (line 231) | def parse_identifier(self):
method parse_qualified_identifier (line 235) | def parse_qualified_identifier(self):
method parse_qualified_identifier_list (line 248) | def parse_qualified_identifier_list(self):
method parse_compilation_unit (line 264) | def parse_compilation_unit(self):
method parse_import_declaration (line 308) | def parse_import_declaration(self):
method parse_type_declaration (line 337) | def parse_type_declaration(self):
method parse_class_or_interface_declaration (line 344) | def parse_class_or_interface_declaration(self):
method parse_normal_class_declaration (line 367) | def parse_normal_class_declaration(self):
method parse_enum_declaration (line 396) | def parse_enum_declaration(self):
method parse_normal_interface_declaration (line 414) | def parse_normal_interface_declaration(self):
method parse_annotation_type_declaration (line 437) | def parse_annotation_type_declaration(self):
method parse_type (line 453) | def parse_type(self):
method parse_basic_type (line 468) | def parse_basic_type(self):
method parse_reference_type (line 472) | def parse_reference_type(self):
method parse_type_arguments (line 491) | def parse_type_arguments(self):
method parse_type_argument (line 508) | def parse_type_argument(self):
method parse_nonwildcard_type_arguments (line 532) | def parse_nonwildcard_type_arguments(self):
method parse_type_list (line 540) | def parse_type_list(self):
method parse_type_arguments_or_diamond (line 561) | def parse_type_arguments_or_diamond(self):
method parse_nonwildcard_type_arguments_or_diamond (line 568) | def parse_nonwildcard_type_arguments_or_diamond(self):
method parse_type_parameters (line 575) | def parse_type_parameters(self):
method parse_type_parameter (line 592) | def parse_type_parameter(self):
method parse_array_dimension (line 610) | def parse_array_dimension(self):
method parse_modifiers (line 622) | def parse_modifiers(self):
method parse_annotations (line 645) | def parse_annotations(self):
method parse_annotation (line 658) | def parse_annotation(self):
method parse_annotation_element (line 674) | def parse_annotation_element(self):
method parse_element_value_pairs (line 681) | def parse_element_value_pairs(self):
method parse_element_value_pair (line 694) | def parse_element_value_pair(self):
method parse_element_value (line 703) | def parse_element_value(self):
method parse_element_value_array_initializer (line 714) | def parse_element_value_array_initializer(self):
method parse_element_values (line 727) | def parse_element_values(self):
method parse_class_body (line 745) | def parse_class_body(self):
method parse_class_body_declaration (line 760) | def parse_class_body_declaration(self):
method parse_member_declaration (line 777) | def parse_member_declaration(self):
method parse_method_or_field_declaraction (line 818) | def parse_method_or_field_declaraction(self):
method parse_method_or_field_rest (line 836) | def parse_method_or_field_rest(self):
method parse_field_declarators_rest (line 845) | def parse_field_declarators_rest(self):
method parse_method_declarator_rest (line 857) | def parse_method_declarator_rest(self):
method parse_void_method_declarator_rest (line 877) | def parse_void_method_declarator_rest(self):
method parse_constructor_declarator_rest (line 895) | def parse_constructor_declarator_rest(self):
method parse_generic_method_or_constructor_declaration (line 910) | def parse_generic_method_or_constructor_declaration(self):
method parse_interface_body (line 940) | def parse_interface_body(self):
method parse_interface_body_declaration (line 954) | def parse_interface_body_declaration(self):
method parse_interface_member_declaration (line 968) | def parse_interface_member_declaration(self):
method parse_interface_method_or_field_declaration (line 991) | def parse_interface_method_or_field_declaration(self):
method parse_interface_method_or_field_rest (line 1007) | def parse_interface_method_or_field_rest(self):
method parse_constant_declarators_rest (line 1019) | def parse_constant_declarators_rest(self):
method parse_constant_declarator_rest (line 1031) | def parse_constant_declarator_rest(self):
method parse_constant_declarator (line 1039) | def parse_constant_declarator(self):
method parse_interface_method_declarator_rest (line 1048) | def parse_interface_method_declarator_rest(self):
method parse_void_interface_method_declarator_rest (line 1068) | def parse_void_interface_method_declarator_rest(self):
method parse_interface_generic_method_declarator (line 1086) | def parse_interface_generic_method_declarator(self):
method parse_formal_parameters (line 1106) | def parse_formal_parameters(self):
method parse_variable_modifiers (line 1145) | def parse_variable_modifiers(self):
method parse_variable_declators (line 1161) | def parse_variable_declators(self):
method parse_variable_declarators (line 1174) | def parse_variable_declarators(self):
method parse_variable_declarator (line 1187) | def parse_variable_declarator(self):
method parse_variable_declarator_rest (line 1196) | def parse_variable_declarator_rest(self):
method parse_variable_initializer (line 1206) | def parse_variable_initializer(self):
method parse_array_initializer (line 1213) | def parse_array_initializer(self):
method parse_block (line 1239) | def parse_block(self):
method parse_block_statement (line 1252) | def parse_block_statement(self):
method parse_local_variable_declaration_statement (line 1319) | def parse_local_variable_declaration_statement(self):
method parse_statement (line 1332) | def parse_statement(self):
method parse_catches (line 1494) | def parse_catches(self):
method parse_catch_clause (line 1507) | def parse_catch_clause(self):
method parse_resource_specification (line 1528) | def parse_resource_specification(self):
method parse_resource (line 1546) | def parse_resource(self):
method parse_switch_block_statement_groups (line 1565) | def parse_switch_block_statement_groups(self):
method parse_switch_block_statement_group (line 1575) | def parse_switch_block_statement_group(self):
method parse_for_control (line 1606) | def parse_for_control(self):
method parse_for_var_control (line 1636) | def parse_for_var_control(self):
method parse_for_var_control_rest (line 1661) | def parse_for_var_control_rest(self):
method parse_for_variable_declarator_rest (line 1685) | def parse_for_variable_declarator_rest(self):
method parse_for_init_or_update (line 1700) | def parse_for_init_or_update(self):
method parse_expression (line 1716) | def parse_expression(self):
method parse_expressionl (line 1731) | def parse_expressionl(self):
method parse_expression_2 (line 1757) | def parse_expression_2(self):
method parse_expression_2_rest (line 1768) | def parse_expression_2_rest(self):
method parse_expression_3 (line 1789) | def parse_expression_3(self):
method parse_method_reference (line 1833) | def parse_method_reference(self):
method parse_lambda_expression (line 1844) | def parse_lambda_expression(self):
method parse_lambda_method_body (line 1862) | def parse_lambda_method_body(self):
method parse_infix_operator (line 1870) | def parse_infix_operator(self):
method parse_primary (line 1888) | def parse_primary(self):
method parse_literal (line 1963) | def parse_literal(self):
method parse_par_expression (line 1968) | def parse_par_expression(self):
method parse_arguments (line 1976) | def parse_arguments(self):
method parse_super_suffix (line 1996) | def parse_super_suffix(self):
method parse_explicit_generic_invocation_suffix (line 2022) | def parse_explicit_generic_invocation_suffix(self):
method parse_creator (line 2037) | def parse_creator(self):
method parse_created_name (line 2066) | def parse_created_name(self):
method parse_class_creator_rest (line 2085) | def parse_class_creator_rest(self):
method parse_array_creator_rest (line 2095) | def parse_array_creator_rest(self):
method parse_identifier_suffix (line 2116) | def parse_identifier_suffix(self):
method parse_explicit_generic_invocation (line 2156) | def parse_explicit_generic_invocation(self):
method parse_inner_creator (line 2165) | def parse_inner_creator(self):
method parse_selector (line 2182) | def parse_selector(self):
method parse_enum_body (line 2228) | def parse_enum_body(self):
method parse_enum_constant (line 2255) | def parse_enum_constant(self):
method parse_annotation_type_body (line 2284) | def parse_annotation_type_body(self):
method parse_annotation_type_element_declarations (line 2294) | def parse_annotation_type_element_declarations(self):
method parse_annotation_type_element_declaration (line 2304) | def parse_annotation_type_element_declaration(self):
method parse_annotation_method_or_constant_rest (line 2336) | def parse_annotation_method_or_constant_rest(self):
function parse (line 2351) | def parse(tokens, debug=False):
FILE: baseline_tokenization/javalang/test/test_java_8_syntax.py
function setup_java_class (line 7) | def setup_java_class(content_to_add):
function filter_type_in_method (line 22) | def filter_type_in_method(clazz, the_type, method_name):
class LambdaSupportTest (line 33) | class LambdaSupportTest(unittest.TestCase):
method assert_contains_lambda_expression_in_m (line 37) | def assert_contains_lambda_expression_in_m(
method test_lambda_support_no_parameters_no_body (line 48) | def test_lambda_support_no_parameters_no_body(self):
method test_lambda_support_no_parameters_expression_body (line 53) | def test_lambda_support_no_parameters_expression_body(self):
method test_lambda_support_no_parameters_complex_expression (line 67) | def test_lambda_support_no_parameters_complex_expression(self):
method test_parameter_no_type_expression_body (line 83) | def test_parameter_no_type_expression_body(self):
method test_parameter_with_type_expression_body (line 95) | def test_parameter_with_type_expression_body(self):
method test_parameters_with_no_type_expression_body (line 109) | def test_parameters_with_no_type_expression_body(self):
method test_parameters_with_mixed_inferred_and_declared_types (line 116) | def test_parameters_with_mixed_inferred_and_declared_types(self):
method test_parameters_inferred_types_with_modifiers (line 123) | def test_parameters_inferred_types_with_modifiers(self):
method test_invalid_parameters_are_invalid (line 130) | def test_invalid_parameters_are_invalid(self):
method test_cast_works (line 137) | def test_cast_works(self):
class MethodReferenceSyntaxTest (line 142) | class MethodReferenceSyntaxTest(unittest.TestCase):
method assert_contains_method_reference_expression_in_m (line 146) | def assert_contains_method_reference_expression_in_m(
method test_method_reference (line 157) | def test_method_reference(self):
method test_method_reference_to_the_new_method (line 162) | def test_method_reference_to_the_new_method(self):
method test_method_reference_to_the_new_method_with_explict_type (line 167) | def test_method_reference_to_the_new_method_with_explict_type(self):
method test_method_reference_from_super (line 174) | def test_method_reference_from_super(self):
method test_method_reference_from_super_with_identifier (line 179) | def test_method_reference_from_super_with_identifier(self):
method test_method_reference_explicit_type_arguments_for_generic_type (line 185) | def test_method_reference_explicit_type_arguments_for_generic_type(self):
method test_method_reference_explicit_type_arguments (line 192) | def test_method_reference_explicit_type_arguments(self):
method test_method_reference_from_array_type (line 199) | def test_method_reference_from_array_type(self):
class InterfaceSupportTest (line 207) | class InterfaceSupportTest(unittest.TestCase):
method test_interface_support_static_methods (line 211) | def test_interface_support_static_methods(self):
method test_interface_support_default_methods (line 227) | def test_interface_support_default_methods(self):
function main (line 237) | def main():
FILE: baseline_tokenization/javalang/test/test_javadoc.py
class TestJavadoc (line 6) | class TestJavadoc(unittest.TestCase):
method test_empty_comment (line 7) | def test_empty_comment(self):
FILE: baseline_tokenization/javalang/test/test_package_declaration.py
class PackageInfo (line 9) | class PackageInfo(unittest.TestCase):
method testPackageDeclarationOnly (line 10) | def testPackageDeclarationOnly(self):
method testAnnotationOnly (line 18) | def testAnnotationOnly(self):
method testJavadocOnly (line 26) | def testJavadocOnly(self):
method testAnnotationThenJavadoc (line 34) | def testAnnotationThenJavadoc(self):
method testJavadocThenAnnotation (line 42) | def testJavadocThenAnnotation(self):
method get_ast (line 50) | def get_ast(self, filename):
function main (line 57) | def main():
FILE: baseline_tokenization/javalang/test/test_util.py
class TestLookAheadIterator (line 6) | class TestLookAheadIterator(unittest.TestCase):
method test_usage (line 7) | def test_usage(self):
FILE: baseline_tokenization/javalang/tokenizer.py
class LexerError (line 7) | class LexerError(Exception):
class JavaToken (line 10) | class JavaToken(object):
method __init__ (line 11) | def __init__(self, value, position=None, javadoc=None):
method __repr__ (line 16) | def __repr__(self):
method __str__ (line 24) | def __str__(self):
method __eq__ (line 27) | def __eq__(self, other):
class EndOfInput (line 30) | class EndOfInput(JavaToken):
class Keyword (line 33) | class Keyword(JavaToken):
class Modifier (line 45) | class Modifier(Keyword):
class BasicType (line 50) | class BasicType(Keyword):
class Literal (line 54) | class Literal(JavaToken):
class Integer (line 57) | class Integer(Literal):
class DecimalInteger (line 60) | class DecimalInteger(Literal):
class OctalInteger (line 63) | class OctalInteger(Integer):
class BinaryInteger (line 66) | class BinaryInteger(Integer):
class HexInteger (line 69) | class HexInteger(Integer):
class FloatingPoint (line 72) | class FloatingPoint(Literal):
class DecimalFloatingPoint (line 75) | class DecimalFloatingPoint(FloatingPoint):
class HexFloatingPoint (line 78) | class HexFloatingPoint(FloatingPoint):
class Boolean (line 81) | class Boolean(Literal):
class Character (line 84) | class Character(Literal):
class String (line 87) | class String(Literal):
class Null (line 90) | class Null(Literal):
class Separator (line 93) | class Separator(JavaToken):
class Operator (line 96) | class Operator(JavaToken):
method is_infix (line 123) | def is_infix(self):
method is_prefix (line 126) | def is_prefix(self):
method is_postfix (line 129) | def is_postfix(self):
method is_assignment (line 132) | def is_assignment(self):
class Annotation (line 136) | class Annotation(JavaToken):
class Identifier (line 139) | class Identifier(JavaToken):
class JavaTokenizer (line 143) | class JavaTokenizer(object):
method __init__ (line 149) | def __init__(self, data):
method reset (line 165) | def reset(self):
method consume_whitespace (line 169) | def consume_whitespace(self):
method read_string (line 186) | def read_string(self):
method try_operator (line 234) | def try_operator(self):
method read_comment (line 241) | def read_comment(self):
method try_javadoc_comment (line 268) | def try_javadoc_comment(self):
method read_decimal_float_or_integer (line 286) | def read_decimal_float_or_integer(self):
method read_hex_integer_or_float (line 314) | def read_hex_integer_or_float(self):
method read_digits (line 344) | def read_digits(self, digits):
method read_decimal_integer (line 362) | def read_decimal_integer(self):
method read_hex_integer (line 366) | def read_hex_integer(self):
method read_bin_integer (line 370) | def read_bin_integer(self):
method read_octal_integer (line 374) | def read_octal_integer(self):
method read_integer_or_float (line 378) | def read_integer_or_float(self, c, c_next):
method try_separator (line 390) | def try_separator(self):
method decode_data (line 396) | def decode_data(self):
method is_java_identifier_start (line 413) | def is_java_identifier_start(self, c):
method read_identifier (line 416) | def read_identifier(self):
method pre_tokenize (line 440) | def pre_tokenize(self):
method tokenize (line 498) | def tokenize(self):
method error (line 569) | def error(self, message, char=None):
function tokenize (line 584) | def tokenize(code):
function reformat_tokens (line 588) | def reformat_tokens(tokens):
FILE: baseline_tokenization/javalang/tree.py
class CompilationUnit (line 6) | class CompilationUnit(Node):
class Import (line 9) | class Import(Node):
class Documented (line 12) | class Documented(Node):
class Declaration (line 15) | class Declaration(Node):
class TypeDeclaration (line 18) | class TypeDeclaration(Declaration, Documented):
method fields (line 22) | def fields(self):
method methods (line 26) | def methods(self):
method constructors (line 30) | def constructors(self):
class PackageDeclaration (line 33) | class PackageDeclaration(Declaration, Documented):
class ClassDeclaration (line 36) | class ClassDeclaration(TypeDeclaration):
class EnumDeclaration (line 39) | class EnumDeclaration(TypeDeclaration):
class InterfaceDeclaration (line 42) | class InterfaceDeclaration(TypeDeclaration):
class AnnotationDeclaration (line 45) | class AnnotationDeclaration(TypeDeclaration):
class Type (line 50) | class Type(Node):
class BasicType (line 53) | class BasicType(Type):
class ReferenceType (line 56) | class ReferenceType(Type):
class TypeArgument (line 59) | class TypeArgument(Node):
class TypeParameter (line 64) | class TypeParameter(Node):
class Annotation (line 69) | class Annotation(Node):
class ElementValuePair (line 72) | class ElementValuePair(Node):
class ElementArrayValue (line 75) | class ElementArrayValue(Node):
class Member (line 80) | class Member(Documented):
class MethodDeclaration (line 83) | class MethodDeclaration(Member, Declaration):
class FieldDeclaration (line 86) | class FieldDeclaration(Member, Declaration):
class ConstructorDeclaration (line 89) | class ConstructorDeclaration(Declaration, Documented):
class ConstantDeclaration (line 94) | class ConstantDeclaration(FieldDeclaration):
class ArrayInitializer (line 97) | class ArrayInitializer(Node):
class VariableDeclaration (line 100) | class VariableDeclaration(Declaration):
class LocalVariableDeclaration (line 103) | class LocalVariableDeclaration(VariableDeclaration):
class VariableDeclarator (line 106) | class VariableDeclarator(Node):
class FormalParameter (line 109) | class FormalParameter(Declaration):
class InferredFormalParameter (line 112) | class InferredFormalParameter(Node):
class Statement (line 117) | class Statement(Node):
class IfStatement (line 120) | class IfStatement(Statement):
class WhileStatement (line 123) | class WhileStatement(Statement):
class DoStatement (line 126) | class DoStatement(Statement):
class ForStatement (line 129) | class ForStatement(Statement):
class AssertStatement (line 132) | class AssertStatement(Statement):
class BreakStatement (line 135) | class BreakStatement(Statement):
class ContinueStatement (line 138) | class ContinueStatement(Statement):
class ReturnStatement (line 141) | class ReturnStatement(Statement):
class ThrowStatement (line 144) | class ThrowStatement(Statement):
class SynchronizedStatement (line 147) | class SynchronizedStatement(Statement):
class TryStatement (line 150) | class TryStatement(Statement):
class SwitchStatement (line 153) | class SwitchStatement(Statement):
class BlockStatement (line 156) | class BlockStatement(Statement):
class StatementExpression (line 159) | class StatementExpression(Statement):
class TryResource (line 164) | class TryResource(Declaration):
class CatchClause (line 167) | class CatchClause(Statement):
class CatchClauseParameter (line 170) | class CatchClauseParameter(Declaration):
class SwitchStatementCase (line 175) | class SwitchStatementCase(Node):
class ForControl (line 178) | class ForControl(Node):
class EnhancedForControl (line 181) | class EnhancedForControl(Node):
class Expression (line 186) | class Expression(Node):
class Assignment (line 189) | class Assignment(Expression):
class TernaryExpression (line 192) | class TernaryExpression(Expression):
class BinaryOperation (line 195) | class BinaryOperation(Expression):
class Cast (line 198) | class Cast(Expression):
class MethodReference (line 201) | class MethodReference(Expression):
class LambdaExpression (line 204) | class LambdaExpression(Expression):
class Primary (line 209) | class Primary(Expression):
class Literal (line 212) | class Literal(Primary):
class This (line 215) | class This(Primary):
class MemberReference (line 218) | class MemberReference(Primary):
class Invocation (line 221) | class Invocation(Primary):
class ExplicitConstructorInvocation (line 224) | class ExplicitConstructorInvocation(Invocation):
class SuperConstructorInvocation (line 227) | class SuperConstructorInvocation(Invocation):
class MethodInvocation (line 230) | class MethodInvocation(Invocation):
class SuperMethodInvocation (line 233) | class SuperMethodInvocation(Invocation):
class SuperMemberReference (line 236) | class SuperMemberReference(Primary):
class ArraySelector (line 239) | class ArraySelector(Expression):
class ClassReference (line 242) | class ClassReference(Primary):
class VoidClassReference (line 245) | class VoidClassReference(ClassReference):
class Creator (line 250) | class Creator(Primary):
class ArrayCreator (line 253) | class ArrayCreator(Creator):
class ClassCreator (line 256) | class ClassCreator(Creator):
class InnerClassCreator (line 259) | class InnerClassCreator(Creator):
class EnumBody (line 264) | class EnumBody(Node):
class EnumConstantDeclaration (line 267) | class EnumConstantDeclaration(Declaration, Documented):
class AnnotationMethod (line 270) | class AnnotationMethod(Declaration):
FILE: baseline_tokenization/javalang/util.py
class LookAheadIterator (line 3) | class LookAheadIterator(object):
method __init__ (line 4) | def __init__(self, iterable):
method __iter__ (line 11) | def __iter__(self):
method set_default (line 14) | def set_default(self, value):
method next (line 17) | def next(self):
method __next__ (line 20) | def __next__(self):
method look (line 31) | def look(self, i=0):
method last (line 52) | def last(self):
method __enter__ (line 55) | def __enter__(self):
method __exit__ (line 59) | def __exit__(self, exc_type, exc_val, exc_tb):
method push_marker (line 66) | def push_marker(self):
method pop_marker (line 70) | def pop_marker(self, reset):
class LookAheadListIterator (line 90) | class LookAheadListIterator(object):
method __init__ (line 91) | def __init__(self, iterable):
method __iter__ (line 100) | def __iter__(self):
method set_default (line 103) | def set_default(self, value):
method next (line 106) | def next(self):
method __next__ (line 109) | def __next__(self):
method look (line 118) | def look(self, i=0):
method last (line 134) | def last(self):
method __enter__ (line 137) | def __enter__(self):
method __exit__ (line 141) | def __exit__(self, exc_type, exc_val, exc_tb):
method push_marker (line 148) | def push_marker(self):
method pop_marker (line 152) | def pop_marker(self, reset):
FILE: baseline_tokenization/subtokenize_nmt_baseline.py
function split_subtokens (line 19) | def split_subtokens(str):
function tokenizeFile (line 22) | def tokenizeFile(file_path):
FILE: common.py
class Common (line 6) | class Common:
method normalize_word (line 14) | def normalize_word(word):
method load_histogram (line 22) | def load_histogram(path, max_size=None):
method load_vocab_from_dict (line 34) | def load_vocab_from_dict(word_to_count, add_values=[], max_size=None):
method binary_to_string (line 50) | def binary_to_string(binary_string):
method binary_to_string_list (line 54) | def binary_to_string_list(binary_string_list):
method binary_to_string_matrix (line 58) | def binary_to_string_matrix(binary_string_matrix):
method binary_to_string_3d (line 62) | def binary_to_string_3d(binary_string_tensor):
method legal_method_names_checker (line 66) | def legal_method_names_checker(name):
method filter_impossible_names (line 70) | def filter_impossible_names(top_words):
method unique (line 75) | def unique(sequence):
method parse_results (line 79) | def parse_results(result, pc_info_dict, topk=5):
method compute_bleu (line 109) | def compute_bleu(ref_file_name, predicted_file_name):
class PredictionResults (line 115) | class PredictionResults:
method __init__ (line 116) | def __init__(self, original_name):
method append_prediction (line 120) | def append_prediction(self, name, current_timestep_paths):
class SingleTimeStepPrediction (line 123) | class SingleTimeStepPrediction:
method __init__ (line 124) | def __init__(self, prediction, attention_paths):
class PathContextInformation (line 137) | class PathContextInformation:
method __init__ (line 138) | def __init__(self, context):
method __str__ (line 144) | def __str__(self):
FILE: config.py
class Config (line 1) | class Config:
method get_default_config (line 3) | def get_default_config(args):
method take_model_hyperparams_from (line 31) | def take_model_hyperparams_from(self, otherConfig):
method __init__ (line 40) | def __init__(self, args):
method get_debug_config (line 73) | def get_debug_config(args):
FILE: extractor.py
class Extractor (line 8) | class Extractor:
method __init__ (line 9) | def __init__(self, config, extractor_api_url, max_path_length, max_pat...
method post_request (line 17) | def post_request(url, code_string):
method extract_paths (line 20) | def extract_paths(self, code_string):
FILE: interactive_predict.py
class InteractivePredictor (line 10) | class InteractivePredictor:
method __init__ (line 13) | def __init__(self, config, model):
method read_file (line 20) | def read_file(input_filename):
method predict (line 24) | def predict(self):
FILE: model.py
class Model (line 14) | class Model:
method __init__ (line 18) | def __init__(self, config):
method close_session (line 59) | def close_session(self):
method train (line 62) | def train(self):
method trace (line 138) | def trace(self, sum_loss, batch_num, multi_batch_start_time):
method evaluate (line 145) | def evaluate(self, release=False):
method update_correct_predictions (line 238) | def update_correct_predictions(self, num_correct_predictions, output_f...
method update_per_subtoken_statistics (line 268) | def update_per_subtoken_statistics(self, results, true_positive, false...
method print_hyperparams (line 289) | def print_hyperparams(self):
method calculate_results (line 311) | def calculate_results(true_positive, false_positive, false_negative):
method trace_evaluation (line 327) | def trace_evaluation(output_file, correct_predictions, total_predictio...
method build_training_graph (line 335) | def build_training_graph(self, input_tensors):
method decode_outputs (line 403) | def decode_outputs(self, target_words_vocab, target_input, batch_size,...
method calculate_path_abstraction (line 470) | def calculate_path_abstraction(self, path_embed, path_lengths, valid_c...
method path_rnn_last_state (line 473) | def path_rnn_last_state(self, is_evaluating, path_embed, path_lengths,...
method compute_contexts (line 513) | def compute_contexts(self, subtoken_vocab, nodes_vocab, source_input, ...
method build_test_graph (line 547) | def build_test_graph(self, input_tensors):
method predict (line 592) | def predict(self, predict_data_lines):
method get_attention_per_path (line 645) | def get_attention_per_path(source_strings, path_strings, target_string...
method save_model (line 657) | def save_model(self, sess, path):
method load_model (line 683) | def load_model(self, sess):
method initialize_session_variables (line 710) | def initialize_session_variables(sess):
method get_should_reuse_variables (line 713) | def get_should_reuse_variables(self):
FILE: preprocess.py
function save_dictionaries (line 14) | def save_dictionaries(dataset_name, subtoken_to_count, node_to_count, ta...
function process_file (line 25) | def process_file(file_path, data_file_role, dataset_name, max_contexts, ...
function context_full_found (line 60) | def context_full_found(context_parts, word_to_count, path_to_count):
function context_partial_found (line 65) | def context_partial_found(context_parts, word_to_count, path_to_count):
FILE: reader.py
class Reader (line 22) | class Reader:
method __init__ (line 27) | def __init__(self, subtoken_to_index, target_to_index, node_to_index, ...
method get_subtoken_table (line 46) | def get_subtoken_table(cls, subtoken_to_index):
method get_target_table (line 52) | def get_target_table(cls, target_to_index):
method get_node_table (line 58) | def get_node_table(cls, node_to_index):
method initialize_hash_map (line 64) | def initialize_hash_map(cls, word_to_index, default_value):
method process_from_placeholder (line 70) | def process_from_placeholder(self, row):
method process_dataset (line 74) | def process_dataset(self, *row_parts):
method reset (line 172) | def reset(self, sess):
method get_output (line 175) | def get_output(self):
method compute_output (line 178) | def compute_output(self):
class Config (line 203) | class Config:
method __init__ (line 204) | def __init__(self):
Condensed preview — 71 files, each showing its path, character count, and a content snippet. Download the .json file or copy it to your clipboard to get the full structured content (337K chars).
[
{
"path": ".gitignore",
"chars": 41,
"preview": "*.class\n*.lst\n.idea/*\n*.iml\n*.xml\n*.pyc\n\n"
},
{
"path": "CITATION.cff",
"chars": 333,
"preview": "@inproceedings{\n alon2018codeseq,\n title={code2seq: Generating Sequences from Structured Representations of Code},"
},
{
"path": "CSharpExtractor/.gitattributes",
"chars": 2518,
"preview": "###############################################################################\n# Set default behavior to automatically "
},
{
"path": "CSharpExtractor/.gitignore",
"chars": 4226,
"preview": "## Ignore Visual Studio temporary files, build results, and\n## files generated by popular Visual Studio add-ons.\n\n# User"
},
{
"path": "CSharpExtractor/CSharpExtractor/.nuget/packages.config",
"chars": 516,
"preview": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<packages>\n <package id=\"NUnit.ConsoleRunner\" version=\"3.6.0\" />\n <package id="
},
{
"path": "CSharpExtractor/CSharpExtractor/CSharpExtractor.sln",
"chars": 2430,
"preview": "\nMicrosoft Visual Studio Solution File, Format Version 12.00\n# Visual Studio 15\nVisualStudioVersion = 15.0.28307.136\nMi"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs",
"chars": 8293,
"preview": "using Extractor.Semantics;\nusing Microsoft.CodeAnalysis;\nusing Microsoft.CodeAnalysis.CSharp;\nusing Microsoft.CodeAnaly"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Extractor.csproj",
"chars": 521,
"preview": "<Project Sdk=\"Microsoft.NET.Sdk\">\n\n <PropertyGroup>\n <OutputType>Exe</OutputType>\n <TargetFramework>netcoreapp2.2"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/PathFinder.cs",
"chars": 2554,
"preview": "using Microsoft.CodeAnalysis;\nusing Microsoft.CodeAnalysis.CSharp.Syntax;\nusing System;\nusing System.Collections.Generic"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Program.cs",
"chars": 1586,
"preview": "using CommandLine;\nusing CommandLine.Text;\nusing System;\nusing System.Collections.Generic;\nusing System.IO;\nusing Syste"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Properties/launchSettings.json",
"chars": 208,
"preview": "{\n \"profiles\": {\n \"Extractor\": {\n \"commandName\": \"Project\",\n \"commandLineArgs\": \"--path C:\\\\Users\\\\urial\\\\"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Temp.cs",
"chars": 177,
"preview": "namespace Extractor\n{\n class Temp\n {\n class NestedClass\n {\n void fooBar()\n {\n"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Tree/Tree.cs",
"chars": 9763,
"preview": "using System;\nusing System.Collections.Generic;\nusing System.IO;\nusing System.Linq;\nusing System.Text;\nusing Microsoft."
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs",
"chars": 5489,
"preview": "using CommandLine;\nusing System;\nusing System.Collections.Generic;\nusing System.Linq;\nusing System.Text;\nusing System.D"
},
{
"path": "CSharpExtractor/CSharpExtractor/Extractor/Variable.cs",
"chars": 2986,
"preview": "using System;\nusing System.Collections.Generic;\nusing System.Linq;\nusing Microsoft.CodeAnalysis;\nusing Microsoft.CodeAn"
},
{
"path": "CSharpExtractor/extract.py",
"chars": 3082,
"preview": "#!/usr/bin/python\n\nimport itertools\nimport multiprocessing\nimport os\nimport sys\nimport shutil\nimport subprocess\nfrom thr"
},
{
"path": "Input.java",
"chars": 43,
"preview": "public String getName() {\n\t\treturn name;\n\t}"
},
{
"path": "JavaExtractor/JPredict/.classpath",
"chars": 1097,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<classpath>\n\t<classpathentry excluding=\"Test.java\" kind=\"src\" output=\"target/clas"
},
{
"path": "JavaExtractor/JPredict/.gitignore",
"chars": 8,
"preview": "/target/"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java",
"chars": 2209,
"preview": "package JavaExtractor;\n\nimport JavaExtractor.Common.CommandLineValues;\nimport org.kohsuke.args4j.CmdLineException;\n\nimpo"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java",
"chars": 1634,
"preview": "package JavaExtractor.Common;\n\nimport org.kohsuke.args4j.CmdLineException;\nimport org.kohsuke.args4j.CmdLineParser;\nimpo"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java",
"chars": 2485,
"preview": "package JavaExtractor.Common;\n\nimport JavaExtractor.FeaturesEntities.Property;\nimport com.github.javaparser.ast.Node;\nim"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java",
"chars": 607,
"preview": "package JavaExtractor.Common;\n\nimport com.github.javaparser.ast.Node;\n\nimport java.util.ArrayList;\n\npublic class MethodC"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java",
"chars": 2843,
"preview": "package JavaExtractor;\n\nimport JavaExtractor.Common.CommandLineValues;\nimport JavaExtractor.Common.Common;\nimport JavaEx"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java",
"chars": 7995,
"preview": "package JavaExtractor;\n\nimport JavaExtractor.Common.CommandLineValues;\nimport JavaExtractor.Common.Common;\nimport JavaEx"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java",
"chars": 1157,
"preview": "package JavaExtractor.FeaturesEntities;\n\nimport java.nio.file.Path;\nimport java.util.ArrayList;\nimport java.util.stream."
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java",
"chars": 446,
"preview": "package JavaExtractor.FeaturesEntities;\n\npublic class ProgramRelation {\n Property source;\n Property target;\n St"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java",
"chars": 8346,
"preview": "package JavaExtractor.FeaturesEntities;\n\nimport JavaExtractor.Common.Common;\nimport com.github.javaparser.ast.Node;\nimpo"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java",
"chars": 2703,
"preview": "package JavaExtractor.Visitors;\n\nimport JavaExtractor.Common.CommandLineValues;\nimport JavaExtractor.Common.Common;\nimpo"
},
{
"path": "JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java",
"chars": 2268,
"preview": "package JavaExtractor.Visitors;\n\nimport JavaExtractor.Common.Common;\nimport JavaExtractor.FeaturesEntities.Property;\nimp"
},
{
"path": "JavaExtractor/JPredict/src/main/java/Test.java",
"chars": 89,
"preview": "class Test {\n void fooBar() {\n System.out.println(\"http://github.com\");\n }\n}"
},
{
"path": "JavaExtractor/extract.py",
"chars": 3450,
"preview": "#!/usr/bin/python\n\nimport itertools\nimport multiprocessing\nimport os\nimport shutil\nimport subprocess\nimport sys\nfrom arg"
},
{
"path": "LICENSE",
"chars": 1065,
"preview": "MIT License\n\nCopyright (c) 2019 Technion\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\no"
},
{
"path": "Python150kExtractor/README.md",
"chars": 1608,
"preview": "# Python150k dataset\n\n## Steps to reproduce\n\n1. Download parsed python dataset from [here](https://www.sri.inf.ethz.ch/p"
},
{
"path": "Python150kExtractor/extract.py",
"chars": 5382,
"preview": "import argparse\nimport re\nimport json\nimport multiprocessing\nimport itertools\nimport tqdm\nimport joblib\nimport numpy as "
},
{
"path": "Python150kExtractor/preprocess.sh",
"chars": 1587,
"preview": "#!/usr/bin/env bash\n\nMAX_CONTEXTS=200\nMAX_DATA_CONTEXTS=1000\nSUBTOKEN_VOCAB_SIZE=186277\nTARGET_VOCAB_SIZE=26347\n\ndata_di"
},
{
"path": "README.md",
"chars": 16413,
"preview": "# code2seq\nThis is an official implementation of the model described in:\n\n[Uri Alon](http://urialon.cswp.cs.technion.ac."
},
{
"path": "__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "baseline_tokenization/input_example.txt",
"chars": 583,
"preview": "requires landscape|boolean (){ return false; }\nget parent key|Object (){ return new ContactsUiKey(); }\nget parent key|Ob"
},
{
"path": "baseline_tokenization/javalang/__init__.py",
"chars": 113,
"preview": "\nfrom . import parser\nfrom . import parse\nfrom . import tokenizer\nfrom . import javadoc\n\n\n__version__ = \"0.10.1\"\n"
},
{
"path": "baseline_tokenization/javalang/ast.py",
"chars": 1807,
"preview": "import pickle\n\nimport six\n\n\nclass MetaNode(type):\n def __new__(mcs, name, bases, dict):\n attrs = list(dict['at"
},
{
"path": "baseline_tokenization/javalang/javadoc.py",
"chars": 2855,
"preview": "\nimport re\n\ndef join(s):\n return ' '.join(l.strip() for l in s.split('\\n'))\n\nclass DocBlock(object):\n def __init__"
},
{
"path": "baseline_tokenization/javalang/parse.py",
"chars": 1123,
"preview": "\nfrom .parser import Parser\nfrom .tokenizer import tokenize\n\ndef parse_expression(exp):\n if not exp.endswith(';'):\n "
},
{
"path": "baseline_tokenization/javalang/parser.py",
"chars": 73685,
"preview": "import six\n\nfrom . import util\nfrom . import tree\nfrom .tokenizer import (\n EndOfInput, Keyword, Modifier, BasicType,"
},
{
"path": "baseline_tokenization/javalang/test/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "baseline_tokenization/javalang/test/source/package-info/AnnotationJavadoc.java",
"chars": 95,
"preview": "@Package\n/**\n Test that includes java doc first but no annotation\n*/\npackage org.javalang.test;"
},
{
"path": "baseline_tokenization/javalang/test/source/package-info/AnnotationOnly.java",
"chars": 35,
"preview": "@Package\npackage org.javalang.test;"
},
{
"path": "baseline_tokenization/javalang/test/source/package-info/JavadocAnnotation.java",
"chars": 95,
"preview": "/**\n Test that includes java doc first but no annotation\n*/\n@Package\npackage org.javalang.test;"
},
{
"path": "baseline_tokenization/javalang/test/source/package-info/JavadocOnly.java",
"chars": 86,
"preview": "/**\n Test that includes java doc first but no annotation\n*/\npackage org.javalang.test;"
},
{
"path": "baseline_tokenization/javalang/test/source/package-info/NoAnnotationNoJavadoc.java",
"chars": 26,
"preview": "package org.javalang.test;"
},
{
"path": "baseline_tokenization/javalang/test/test_java_8_syntax.py",
"chars": 8702,
"preview": "import unittest\n\nfrom pkg_resources import resource_string\nfrom .. import parse, parser, tree\n\n\ndef setup_java_class(con"
},
{
"path": "baseline_tokenization/javalang/test/test_javadoc.py",
"chars": 305,
"preview": "import unittest\n\nfrom .. import javadoc\n\n\nclass TestJavadoc(unittest.TestCase):\n def test_empty_comment(self):\n "
},
{
"path": "baseline_tokenization/javalang/test/test_package_declaration.py",
"chars": 2032,
"preview": "import unittest\n\nfrom pkg_resources import resource_string\nfrom .. import parse\n\n\n# From my reading of the spec (http://"
},
{
"path": "baseline_tokenization/javalang/test/test_util.py",
"chars": 1882,
"preview": "import unittest\n\nfrom ..util import LookAheadIterator\n\n\nclass TestLookAheadIterator(unittest.TestCase):\n def test_usa"
},
{
"path": "baseline_tokenization/javalang/tokenizer.py",
"chars": 17036,
"preview": "import re\nimport unicodedata\n\nimport six\n\n\nclass LexerError(Exception):\n pass\n\nclass JavaToken(object):\n def __ini"
},
{
"path": "baseline_tokenization/javalang/tree.py",
"chars": 6844,
"preview": "\nfrom .ast import Node\n\n# ------------------------------------------------------------------------------\n\nclass Compilat"
},
{
"path": "baseline_tokenization/javalang/util.py",
"chars": 4208,
"preview": "\n\nclass LookAheadIterator(object):\n def __init__(self, iterable):\n self.iterable = iter(iterable)\n self"
},
{
"path": "baseline_tokenization/subtokenize_nmt_baseline.py",
"chars": 1591,
"preview": "#!/usr/bin/python\n\nimport javalang\nimport sys\nimport re\n\n\nmodifiers = ['public', 'private', 'protected', 'static']\n\nRE_W"
},
{
"path": "code2seq.py",
"chars": 2015,
"preview": "from argparse import ArgumentParser\nimport numpy as np\nimport tensorflow as tf\n\nfrom config import Config\nfrom interacti"
},
{
"path": "common.py",
"chars": 5842,
"preview": "import re\nimport subprocess\nimport sys\n\n\nclass Common:\n internal_delimiter = '|'\n SOS = '<S>'\n EOS = '</S>'\n "
},
{
"path": "config.py",
"chars": 3620,
"preview": "class Config:\n @staticmethod\n def get_default_config(args):\n config = Config(args)\n config.NUM_EPOCH"
},
{
"path": "extractor.py",
"chars": 1690,
"preview": "import json\n\nimport requests\n\nfrom common import PathContextInformation\n\n\nclass Extractor:\n def __init__(self, config"
},
{
"path": "interactive_predict.py",
"chars": 2506,
"preview": "from common import Common\nfrom extractor import Extractor\n\nSHOW_TOP_CONTEXTS = 10\nMAX_PATH_LENGTH = 8\nMAX_PATH_WIDTH = 2"
},
{
"path": "model.py",
"chars": 41708,
"preview": "import _pickle as pickle\nimport os\nimport time\n\nimport numpy as np\nimport shutil\nimport tensorflow as tf\n\nimport reader\n"
},
{
"path": "preprocess.py",
"chars": 6160,
"preview": "import pickle\nfrom argparse import ArgumentParser\n\nimport numpy as np\n\nimport common\n\n'''\nThis script preprocesses the d"
},
{
"path": "preprocess.sh",
"chars": 4207,
"preview": "#!/usr/bin/env bash\n###########################################################\n# Change the following values to preproc"
},
{
"path": "preprocess_csharp.sh",
"chars": 4505,
"preview": "#!/usr/bin/env bash\n###########################################################\n# Change the following values to preproc"
},
{
"path": "reader.py",
"chars": 15445,
"preview": "import os\n\nimport tensorflow as tf\n\nfrom common import Common\n\nTARGET_INDEX_KEY = 'TARGET_INDEX_KEY'\nTARGET_STRING_KEY ="
},
{
"path": "train.sh",
"chars": 770,
"preview": "###########################################################\n# Change the following values to train a new model.\n# type: "
},
{
"path": "train_python150k.sh",
"chars": 427,
"preview": "#!/usr/bin/env bash\n\ndata_dir=$1\ndata_name=$(basename \"${data_dir}\")\ndata=${data_dir}/${data_name}\ntest=${data_dir}/${da"
}
]
// ... and 1 more file (download for full content)
About this extraction
This page contains the full source code of the tech-srl/code2seq GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 71 files (312.7 KB), approximately 70.2k tokens, and a symbol index with 557 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.